Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/windirstat/llfio.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiall Douglas (s [underscore] sourceforge {at} nedprod [dot] com) <spam@nowhere>2016-03-21 02:41:51 +0300
committerNiall Douglas (s [underscore] sourceforge {at} nedprod [dot] com) <spam@nowhere>2016-03-21 02:41:51 +0300
commit758a934ab266ed660daa54b72e4606b78e374071 (patch)
tree6f2fe1c5d2b8331f9319549bc6f0c3390168eb6b /reference
AFIO v2: Relocate all the AFIO v2 files in fs_probe into the root hierarchy. AFIO v2 is now the master branch!
Diffstat (limited to 'reference')
-rw-r--r--reference/11.1.pdfbin0 -> 971081 bytes
-rw-r--r--reference/54d0f0190cf29ca811040c8a.pdfbin0 -> 364048 bytes
-rw-r--r--reference/Chidambaram.pdfbin0 -> 563249 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct.html2434
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-analysis-cdf.gifbin0 -> 6451 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-method-edp.gifbin0 -> 7633 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-big-legend.gifbin0 -> 12226 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-big.gifbin0 -> 40604 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-small-ext3.gifbin0 -> 18507 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-small-hfsplus.gifbin0 -> 19125 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-small-jfs.gifbin0 -> 20197 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-small-nfs.gifbin0 -> 22061 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-small-reiserfs.gifbin0 -> 23507 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-small-xfs.gifbin0 -> 96047 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/fig-result-zoom.gifbin0 -> 5942 bytes
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/main.css210
-rw-r--r--reference/Error Handling is Ocassionally Correct_files/new_usenix.jpgbin0 -> 20617 bytes
-rw-r--r--reference/Files are hard.html484
-rw-r--r--reference/Files are hard_files/fs_properties.pngbin0 -> 34362 bytes
-rw-r--r--reference/Files are hard_files/program_bugs.pngbin0 -> 85886 bytes
-rw-r--r--reference/Linux KAIO/History of Linux KAIO API.pdfbin0 -> 145418 bytes
-rw-r--r--reference/Linux KAIO/KAIOUserGuide.htm733
-rw-r--r--reference/Linux KAIO/linux-kaio.txt552
-rw-r--r--reference/iron-sosp05.pdfbin0 -> 277604 bytes
-rw-r--r--reference/osdi14-paper-pillai.pdfbin0 -> 453389 bytes
25 files changed, 4413 insertions, 0 deletions
diff --git a/reference/11.1.pdf b/reference/11.1.pdf
new file mode 100644
index 00000000..9a36eb1b
--- /dev/null
+++ b/reference/11.1.pdf
Binary files differ
diff --git a/reference/54d0f0190cf29ca811040c8a.pdf b/reference/54d0f0190cf29ca811040c8a.pdf
new file mode 100644
index 00000000..1f703f04
--- /dev/null
+++ b/reference/54d0f0190cf29ca811040c8a.pdf
Binary files differ
diff --git a/reference/Chidambaram.pdf b/reference/Chidambaram.pdf
new file mode 100644
index 00000000..5957afda
--- /dev/null
+++ b/reference/Chidambaram.pdf
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct.html b/reference/Error Handling is Ocassionally Correct.html
new file mode 100644
index 00000000..ff3c38a1
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct.html
@@ -0,0 +1,2434 @@
+
+<!-- saved from url=(0089)https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html -->
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=windows-1252"><script src="https://js-agent.newrelic.com/nr-885.min.js"></script><script type="text/javascript">window.NREUM||(NREUM={}),__nr_require=function(e,t,n){function r(n){if(!t[n]){var o=t[n]={exports:{}};e[n][0].call(o.exports,function(t){var o=e[n][1][t];return r(o||t)},o,o.exports)}return t[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<n.length;o++)r(n[o]);return r}({QJf3ax:[function(e,t){function n(){}function r(e){function t(e){return e&&e instanceof n?e:e?a(e,i,o):o()}function s(n,r,o){e&&e(n,r,o);for(var i=t(o),a=l(n),u=a.length,f=0;u>f;f++)a[f].apply(i,r);var s=c[w[n]];return s&&s.push([h,n,r,i]),i}function p(e,t){g[e]=l(e).concat(t)}function l(e){return g[e]||[]}function d(e){return f[e]=f[e]||r(s)}function v(e,t){u(e,function(e,n){t=t||"feature",w[n]=t,t in c||(c[t]=[])})}var g={},w={},h={on:p,emit:s,get:d,listeners:l,context:t,buffer:v};return h}function o(){return new n}var i="nr@context",a=e("gos"),u=e(1),c={},f={},s=t.exports=r();s.backlog=c},{1:12,gos:"7eSDFh"}],ee:[function(e,t){t.exports=e("QJf3ax")},{}],3:[function(e,t){function n(e,t){return function(){r(e,[(new Date).getTime()].concat(i(arguments)),null,t)}}var r=e("handle"),o=e(1),i=e(2);"undefined"==typeof window.newrelic&&(newrelic=NREUM);var a=["setPageViewName","addPageAction","setCustomAttribute","finished","addToTrace","inlineHit"],u=["addPageAction"],c="api-";o(a,function(e,t){newrelic[t]=n(c+t,"api")}),o(u,function(e,t){newrelic[t]=n(c+t)}),t.exports=newrelic,newrelic.noticeError=function(e){"string"==typeof e&&(e=new Error(e)),r("err",[e,(new Date).getTime()])}},{1:12,2:13,handle:"D5DuLP"}],gos:[function(e,t){t.exports=e("7eSDFh")},{}],"7eSDFh":[function(e,t){function n(e,t,n){if(r.call(e,t))return e[t];var o=n();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(e,t,{value:o,writable:!0,enumerable:!1}),o}catch(i){}return e[t]=o,o}var 
r=Object.prototype.hasOwnProperty;t.exports=n},{}],handle:[function(e,t){t.exports=e("D5DuLP")},{}],D5DuLP:[function(e,t){function n(e,t,n,o){r.buffer([e],o),r.emit(e,t,n)}var r=e("ee").get("handle");t.exports=n,n.ee=r},{ee:"QJf3ax"}],XL7HBI:[function(e,t){function n(e){var t=typeof e;return!e||"object"!==t&&"function"!==t?-1:e===window?0:i(e,o,function(){return r++})}var r=1,o="nr@id",i=e("gos");t.exports=n},{gos:"7eSDFh"}],id:[function(e,t){t.exports=e("XL7HBI")},{}],G9z0Bl:[function(e,t){function n(){if(!v++){var e=d.info=NREUM.info,t=f.getElementsByTagName("script")[0];if(e&&e.licenseKey&&e.applicationID&&t){u(p,function(t,n){e[t]||(e[t]=n)});var n="https"===s.split(":")[0]||e.sslForHttp;d.proto=n?"https://":"http://",a("mark",["onload",i()],null,"api");var r=f.createElement("script");r.src=d.proto+e.agent,t.parentNode.insertBefore(r,t)}}}function r(){"complete"===f.readyState&&o()}function o(){a("mark",["domContent",i()],null,"api")}function i(){return(new Date).getTime()}var a=e("handle"),u=e(1),c=window,f=c.document;NREUM.o={ST:setTimeout,XHR:c.XMLHttpRequest,REQ:c.Request,EV:c.Event,PR:c.Promise,MO:c.MutationObserver},e(2);var s=(""+location).split("?")[0],p={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",agent:"js-agent.newrelic.com/nr-885.min.js"},l=window.XMLHttpRequest&&XMLHttpRequest.prototype&&XMLHttpRequest.prototype.addEventListener&&!/CriOS/.test(navigator.userAgent),d=t.exports={offset:i(),origin:s,features:{},xhrWrappable:l};f.addEventListener?(f.addEventListener("DOMContentLoaded",o,!1),c.addEventListener("load",n,!1)):(f.attachEvent("onreadystatechange",r),c.attachEvent("onload",n)),a("mark",["firstbyte",i()],null,"api");var v=0},{1:12,2:3,handle:"D5DuLP"}],loader:[function(e,t){t.exports=e("G9z0Bl")},{}],12:[function(e,t){function n(e,t){var n=[],o="",i=0;for(o in e)r.call(e,o)&&(n[i]=t(o,e[o]),i+=1);return n}var r=Object.prototype.hasOwnProperty;t.exports=n},{}],13:[function(e,t){function n(e,t,n){t||(t=0),"undefined"==typeof 
n&&(n=e?e.length:0);for(var r=-1,o=n-t||0,i=Array(0>o?0:o);++r<o;)i[r]=e[t+r];return i}t.exports=n},{}]},{},["G9z0Bl"]);</script>
+<title>main</title>
+<meta name="description" content="main">
+<meta name="keywords" content="main">
+<meta name="resource-type" content="document">
+<meta name="distribution" content="global">
+
+<meta name="Generator" content="LaTeX2HTML v2002-2-1">
+<meta http-equiv="Content-Style-Type" content="text/css">
+
+<link rel="STYLESHEET" href="./Error Handling is Ocassionally Correct_files/main.css">
+
+</head>
+
+<body><a href="http://www.usenix.org/"><img src="./Error Handling is Ocassionally Correct_files/new_usenix.jpg" width="288" height="232" alt="Check out the new USENIX Web site." align="right"></a>
+
+
+
+
+
+<h1 align="CENTER">EIO: <u>E</u>rror Handling <u>i</u>s <u>O</u>ccasionally Correct</h1><div>
+
+<p align="CENTER"><strong>Haryadi S. Gunawi, Cindy Rubio-González,</strong><br>
+<strong>Andrea C. Arpaci-Dusseau, Remzi H. Arpaci-Dusseau, Ben Liblit</strong></p>
+<p align="CENTER"><em>Computer Sciences Department, University of Wisconsin-Madison</em> </p>
+</div>
+
+
+<h1>Abstract</h1>
+
+<p>
+<em>The reliability of file systems depends in part on how well they
+propagate errors. We develop a static analysis technique, EDP, that
+analyzes how file systems and storage device drivers propagate error
+codes. Running our EDP analysis on all file systems and 3 major
+storage device drivers in Linux 2.6, we find that errors are often
+incorrectly propagated; 1153 calls (13%) drop an error code without
+handling it.
+</em>
+</p><p>
+<em>We perform a set of analyses to rank the robustness of each subsystem
+based on the completeness of its error propagation; we find that many
+popular file systems are less robust than other available choices. We
+confirm that write errors are neglected more often than read
+errors. We also find that many violations are not corner-case
+mistakes, but perhaps intentional choices. Finally, we show that
+inter-module calls play a part in incorrect error propagation, but
+that chained propagations do not. In conclusion, error propagation
+appears complex and hard to perform correctly in modern systems. </em>
+
+
+</p><h1><a name="SECTION00020000000000000000"></a>
+<a name="sec-intro"></a><br>
+1 Introduction
+</h1>
+
+<p>
+The robustness of file systems and storage systems is a major concern,
+and rightly so&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#YangEtAl04-FSErrors">32</a>]. Recent work has shown that
+file systems are especially unreliable when the underlying disk system
+does not behave as expected&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#PrabhakaranEtAl05-SOSP">20</a>].
+Specifically, many modern commodity file systems, such as Linux
+ext3&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#Tweedie98-JournalingExt2">31</a>],
+ReiserFS&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#Reiser04-ReiserFS">23</a>], IBM's JFS&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#Best00-JFS-Local">1</a>], and
+Windows NTFS&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#Solomon98-NT">27</a>], all have serious bugs and
+inconsistencies in how they handle errors from the storage system.
+However, the question remains unanswered as to why these
+fault-handling bugs are present.
+
+</p><p>
+In this paper, we investigate what we believe is one of the root
+causes of deficient fault handling: <em>incorrect error code
+propagation</em>. To be properly handled, a low-level error code (<i>e.g.</i>, an
+"I/O error" returned from a device driver) must be correctly
+propagated to the appropriate code in the file system. Further, if the
+file system is unable to recover from the fault, it may wish to pass
+the error up to the application, again requiring correct error
+propagation.
+
+</p><p>
+Without correct error propagation, any comprehensive failure policy is
+useless: recovery mechanisms and policies cannot be invoked if the
+error is not propagated. Incorrect error propagation has been a
+significant problem in many systems. For example, self-healing
+systems cannot heal themselves if error signals never reach the
+self-recovery
+modules&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#EllardMegquier05-DISP">6</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#SidiroglouEtAl05-STEM">26</a>], components
+behind an interface do not receive error
+notifications&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#KoopmanDeVale99-POSIX">16</a>], and distributed systems
+often obtain misleading error
+codes&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#KolaEtAl05-FaultInLDS">15</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#ThainLivny02-ErrorScope">30</a>], which
+turns into frustration for human debugging. In summary, if errors are
+not propagated, then the effort spent detecting and recovering from
+those
+errors&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#CandeaEtAl04-Reboot">4</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#CowanEtAl98-Stackguard">5</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#NeculaEtAl05-CCured">18</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#QinEtAl05-Safemem">21</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#QinEtAl05-Rx">22</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#SwiftEtAl03-Nooks">28</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#SwiftEtAl04-MoreNooks">29</a>]
+is worthless.
+
+</p><p>
+To analyze how errors are propagated in file and storage system code,
+we have developed a static source-code analysis technique. Our
+technique, named <em>Error Detection and Propagation (EDP)</em> analysis,
+shows how error codes flow through the file system and storage
+drivers. EDP performs a dataflow analysis by constructing a
+function-call graph showing how error codes propagate through return
+values and function parameters.
+
+</p><p>
+We have applied EDP analysis to all file systems and 3 major storage
+device drivers (SCSI, IDE, and Software RAID) implemented in Linux
+2.6. We find that <em>error handling is occasionally correct</em>.
+Specifically, we see that low-level errors are sometimes lost as they
+travel through the many layers of the storage subsystem: out of the
+9022 function calls through which the analyzed error codes
+propagate, we find that 1153 calls (13%) do not correctly save the
+propagated error codes.
+
+</p><p>
+Our detailed analysis enables us to make a number of conclusions.
+First, we find that the more complex the file system (in terms of both
+lines of code and number of function calls with error codes), the more
+likely it is to incorrectly propagate errors; thus, these more complex
+file systems are more likely to suffer from silent failures. Second,
+we observe that I/O write operations are more likely to neglect error
+codes than I/O read operations. Third, we find that many violations
+are not corner-case mistakes: the return codes of some functions are
+consistently ignored, which makes us suspect that the omissions are
+intentional. Finally, we show how inter-module calls play a major
+part in causing incorrect error propagation, but that chained
+propagations do not.
+
+</p><p>
+The rest of this paper is organized as follows. We describe our
+methodology and present our results in Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-method">2</a> and
+&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-result">3</a> respectively. To understand the root causes of the
+problem, we perform a set of deeper analyses in
+Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-analysis">4</a>. Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-future">5</a>
+and&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-related">6</a> discuss future work and related work
+respectively. Finally, Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-conclude">7</a> concludes.
+
+</p><h1><a name="SECTION00030000000000000000"></a>
+<a name="sec-method"></a><br>
+2 Methodology
+</h1>
+
+<p>
+To understand the propagation of error codes, we have developed
+a static analysis technique that we name <em>Error Detection and
+Propagation (EDP)</em>. In this section, we identify the components of
+Linux 2.6 that we will analyze and describe EDP.
+
+</p><p>
+
+</p><h2><a name="SECTION00031000000000000000"><br>
+2.1 Target Systems</a>
+</h2>
+
+<p>
+In this paper, we analyze how errors are propagated through the file
+systems and storage device drivers in Linux 2.6.15.4. We examine all
+Linux implementations of file systems that are located in 51
+directories. These file systems are of different types, including
+disk-based file systems,
+network file systems,
+file system protocols,
+and many others. Our analysis follows requests through the virtual
+file system and memory management layers as well. In addition to file
+systems, we also examine three major storage device drivers (SCSI,
+IDE, and software RAID), as well as all lower-level drivers. Beyond
+these subsystems, our tool can be used to analyze other Linux
+components as well.
+
+</p><p>
+
+</p><div align="CENTER">
+
+<p><a name="fig-method-edp"></a></p><div align="CENTER">
+<img src="./Error Handling is Ocassionally Correct_files/fig-method-edp.gif"></div>
+<br>
+<font size="-1"><i>
+Figure 1: <b>EDP Architecture.</b> The diagram shows the
+framework for Error Detection and Propagation (EDP) analysis of file
+and storage systems code.</i></font>
+<br>
+
+</div>
+
+<p>
+
+</p><h2><a name="SECTION00032000000000000000"><br>
+2.2 EDP Analysis</a>
+</h2>
+
+<p>
+The basic mechanism of EDP is a dataflow analysis: EDP constructs a
+function-call graph covering all cases in which error codes propagate
+through return values or function parameters. To build EDP, we
+harness C Intermediate Language (CIL)&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#Necula02-CIL">19</a>]. CIL
+performs source-to-source transformation of C programs and thus can be
+used in the analysis of large complex programs such as the Linux
+kernel. The EDP analysis is written as a CIL extension in 4000 lines
+of code in the OCaml language.
+
+</p><p>
+The abstraction that we introduce in EDP is that error codes flow
+along <em>channels</em>, where a channel is the set of function calls
+between where an error code is first generated and where it is
+terminated (<i>e.g.</i>, by being either handled or dropped). As shown in
+Figure&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-method-edp">1</a>, EDP contains three major components. The
+first component identifies the error codes that will be tracked. The
+second constructs the channels along which the error codes propagate.
+Finally, the third component analyzes the channels and classifies each
+as being either complete or broken.
+
+</p><p>
+<br></p><div align="CENTER">
+<table cellpadding="3" border="1" align="CENTER">
+<tbody><tr><td><font color="#FFFFFF">-</font></td>
+<td align="CENTER" colspan="1"><font size="-1"><b>Single</b></font></td>
+<td align="CENTER" colspan="1"><font size="-1"><b>Full</b></font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+Subsystem</font></td>
+</tr>
+<tr><td align="LEFT" colspan="1"><font size="-1">
+Subsystem</font></td>
+<td align="CENTER" colspan="1"><font size="-1"><b>(seconds)</b></font></td>
+<td align="CENTER" colspan="1"><font size="-1"><b>(seconds)</b></font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+Size (Kloc)</font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+
+VFS </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>4 </b> </font></td>
+<td align="CENTER"><font size="-1"> - </font></td>
+<td align="CENTER"><font size="-1"> 34 </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Mem. Mgmt. </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>3 </b> </font></td>
+<td align="CENTER"><font size="-1"> - </font></td>
+<td align="CENTER"><font size="-1"> 20 </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+
+XFS </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>8 </b> </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>13 </b> </font></td>
+<td align="CENTER"><font size="-1"> 71 </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+ReiserFS </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>3 </b> </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>8 </b> </font></td>
+<td align="CENTER"><font size="-1"> 24 </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+ext3 </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>2 </b> </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>7 </b> </font></td>
+<td align="CENTER"><font size="-1"> 12 </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Apple HFS </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>1 </b> </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>6 </b> </font></td>
+<td align="CENTER"><font size="-1"> 5 </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+VFAT </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>1 </b> </font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>5 </b> </font></td>
+<td align="CENTER"><font size="-1"> 1 </font></td>
+</tr>
+<tr><td align="LEFT" colspan="2"><font size="-1">
+
+All File Systems Together</font></td>
+<td align="CENTER"><font size="-1"> </font><font size="-1"><b>47 </b> </font></td>
+<td align="CENTER"><font size="-1"> 372 </font></td>
+</tr>
+</tbody></table>
+
+</div>
+<br>
+<a name="table-method-performance"></a>
+<font size="-1">
+<i>Table 1: <b>EDP Performance.</b> The table shows
+the EDP runtime for different subsystems. "Single" runtime
+represents the time to analyze each subsystem in isolation without
+interaction with other subsystems (e.g., VFS and MM).
+"Full" runtime represents the time to analyze a file system along
+with the virtual file system and the memory management. The last row
+reports the time to analyze all of the file systems together. </i></font>
+<br>
+
+<br>
+
+<p>
+Table&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#table-method-performance">1</a> reports the EDP runtime for
+different subsystems, running on a machine with 2.4 GHz Intel Pentium
+4 CPU and 512 MB of memory. Overall, EDP analysis is fast; analyzing
+all file systems together in a single run only takes 47 seconds. We
+now describe the three components of EDP in more detail.
+
+</p><p>
+
+</p><h3><a name="SECTION00032100000000000000"><br>
+2.2.1 Error Code Information</a>
+</h3>
+
+<p>
+The first component of EDP identifies the error codes to track. One
+example is <tt><font size="-1">EIO</font></tt>, a generic error code that commonly indicates I/O
+failure and is used extensively throughout the file system; for
+example, in ext3, <tt><font size="-1">EIO</font></tt> touches 266 functions and propagates through
+467 calls. Besides <tt><font size="-1">EIO</font></tt>, many kernel subsystems commonly use other
+error codes as defined in <tt><font size="-1">include/asm-generic/errno.h</font></tt>. In total,
+there are hundreds of error codes that are used for different
+purposes. We report our findings on the propagation of 34 basic error
+codes that are mostly used across all file systems and storage device
+drivers. These error codes can be found in
+<tt><font size="-1">include/asm-generic/errno-base.h</font></tt>.
+
+</p><p>
+
+</p><h3><a name="SECTION00032200000000000000"><br>
+2.2.2 Channel Construction</a>
+</h3>
+
+<p>
+The second component of EDP constructs the <em>channel</em> in which the
+specified error codes propagate. A channel can be constructed from
+function calls and asynchronous wake-up paths; in our current
+analysis, we focus only on function calls and discuss asynchronous
+paths in Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-future-channel">5.3</a>.
+
+</p><p>
+We define a channel by its two endpoints: generation and termination.
+The <em>generation endpoint</em> is the function that exposes an error
+code, either directly through a return value (<i>e.g.</i>, the function
+contains a <tt><font size="-1">return</font></tt> <tt><font size="-1">-EIO</font></tt> statement) or indirectly through a
+function argument passed by reference. After finding all generation
+endpoints, EDP marks each function that propagates the error codes;
+<em>propagating functions</em> receive error codes from the functions
+that they call and then simply propagate them in a return value or
+function parameter. The <em>termination endpoint</em> is the function in
+which an error code is no longer propagated in the return value or a
+parameter of the function.
+
+</p><p>
+One of the major challenges we address when constructing error
+channels is handling function pointers. The typical approach for
+handling function pointers is to implement a points-to
+analysis&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#Hind01-PointerAnalysis">13</a>] that identifies the set of real
+functions each function pointer might point at; however,
+field-sensitive points-to analyses can be expensive. Therefore, we
+customize our points-to analysis to exploit the systematic structure
+that these pointers exhibit.
+
+</p><p>
+First, we keep track of all structures that have function pointers. For
+example, the VFS read and write interfaces are defined as fields in
+the <tt><font size="-1">file_ops</font></tt> structure:
+
+</p><p>
+</p><pre> struct file_ops {
+ int (*read) ();
+ int (*write) ();
+ };
+</pre>
+
+<p>
+Since each file system needs to define its own <tt><font size="-1">file_ops</font></tt>, we
+automatically find all global instances of such structures, look for
+the function pointer assignments within the instances, and map
+function-pointer implementations to the function pointer interfaces.
+For example, ext2 and ext3 define their file operations like this:
+
+</p><p>
+</p><pre> struct file_ops ext2_f_ops {
+ .read = ext2_read;
+ .write = ext2_write;
+ };
+ struct file_ops ext3_f_ops {
+ .read = ext3_read;
+ .write = ext3_write;
+ };
+</pre>
+
+<p>
+Given such global structure instances, we add the interface
+implementations (<i>e.g.</i>, <tt><font size="-1">ext2_read</font></tt>) to the implementation list of
+the corresponding interfaces (<i>e.g.</i>,
+<tt><font size="-1">file_ops</font></tt>&rarr;<tt><font size="-1">read</font></tt>). Although this technique
+connects most of the mappings, a function pointer assignment could
+still occur in an instruction rather than in a global structure
+instance. Thus, our tool also visits all functions and finds any
+assignment that maps an implementation to an interface. For example,
+if we find an assignment such as <tt><font size="-1">f_op-&gt;read</font></tt> <tt><font size="-1">=</font></tt>
+<tt><font size="-1">ntfs_read</font></tt>, then we add <tt><font size="-1">ntfs_read</font></tt> to the list of
+<tt><font size="-1">file_ops</font></tt>&rarr;<tt><font size="-1">read</font></tt> implementations.
+
+</p><p>
+In the last phase, we change function pointer calls to direct
+calls. For example, if VFS makes an interface call such as
+<tt><font size="-1">(f_op-&gt;read)()</font></tt>, then we automatically rewrite such
+a call to:
+
+</p><p>
+</p><pre> switch (...) {
+ case ext2: ext2_read(); break;
+ case ext3: ext3_read(); break;
+ case ntfs: ntfs_read(); break;
+ ...
+ }
+</pre>
+
+<p>
+Across all Linux file systems and storage device drivers, there are
+191 structural interfaces (<i>e.g.</i>, <tt><font size="-1">file_ops</font></tt>), 904 function pointer
+fields (<i>e.g.</i>, <tt><font size="-1">read</font></tt>), 5039 implementations (<i>e.g.</i>, <tt><font size="-1">ext2_read</font></tt>),
+and 2865 function pointer calls (<i>e.g.</i>, <tt><font size="-1">(f_op-&gt;read)()</font></tt>). Out of
+2865 function pointer calls, we connect all except 564 calls (20%).
+The unconnected 20% of calls are due to indirect implementation
+assignment. For example, we cannot map an assignment such as
+<tt><font size="-1">f_op-&gt;read</font></tt> <tt><font size="-1">=</font></tt> <tt><font size="-1">f</font></tt>, where <tt><font size="-1">f</font></tt> is either a local
+variable or a function parameter, and not a function name. While it
+is feasible to trace back such assignments using stronger and more
+expensive analysis, we assume that major interfaces linking modules
+together have already been connected as part of global instances. If
+all calls are connected, more error propagation chains can be analyzed,
+which means more violations are likely to be found.
+
+
+</p><h3><a name="SECTION00032300000000000000"><br>
+2.2.3 Channel Analysis</a>
+</h3>
+
+<p>
+The third component of EDP distinguishes two kinds of channels:
+error-complete and error-broken channels. An <em>error-complete</em>
+channel is a channel that minimally checks the occurrence of an
+error. An error-complete channel thus has this property at its
+termination endpoint:
+
+</p><p><i>
+&nbsp;&nbsp;&nbsp;&nbsp; &#8707; if (expr) { ... }, where <br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; errorCodeVariable &#8838; expr
+</i>
+
+
+
+</p><p>
+which states that an error code is considered checked if there exists
+an <tt><font size="-1">if</font></tt> condition whose expression contains the variable that
+stores the error code. For example, the function
+<tt><font size="-1">goodTerminationEndpoint</font></tt> in the code segment below carries an
+error-complete channel because the function saves the returned error
+code (line 2) and checks the error code (line 3):
+
+</p><p>
+</p><pre> 1 void goodTerminationEndpoint() {
+ 2 int err = generationEndpoint();
+ 3 if (err)
+ 4 ...
+ 5 }
+ 6 int generationEndpoint() {
+ 7 return -EIO;
+ 8 }
+</pre>
+
+<p>
+Note that an error could be checked but not handled properly, <i>e.g.</i>&nbsp;no
+error handling in the <tt><font size="-1">if</font></tt> condition. Since error handling is
+usually specific to each file system, and hence there are many
+instances of it, we decided to be "generous" in the way we define
+how an error is handled, <i>i.e.</i>&nbsp;by just checking it. More violations
+might be found when we incorporate all instances of error
+handling.
+
+</p><p>
+An <em>error-broken</em> channel is the inverse of an error-complete
+channel. In particular, the error code is either <em>unsaved</em>, <em>unchecked</em>, or <em>overwritten</em>. For example, the function
+<tt><font size="-1">badTerminationEndpoint</font></tt> below carries an error-broken channel of
+unchecked type because the function saves the returned error code
+(line 2) but it never checks the error before the function exits
+(line 3):
+
+</p><p>
+</p><pre> 1 void badTerminationEndpoint() {
+ 2 int err = generationEndpoint();
+ 3 return;
+ 4 }
+</pre>
+
+<p>
+An error-broken channel is a serious file system bug because it can
+lead to a silent failure. In a few cases, we inject faults in
+error-broken channels to confirm the existence of silent failures. We
+utilize our block-level fault injection
+technique&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#PrabhakaranEtAl05-SOSP">20</a>] to exercise error-broken
+channels that relate to disk I/O. In a broken channel, we look for
+two pieces of information: which workload and which failure led us to
+that channel. After finding the necessary information, we run the
+workload, inject the specific block failure, and observe the I/O
+traces and the returned error codes received in upper layers (<i>e.g.</i>, the
+application layer) to confirm whether a broken channel leads to a
+silent failure. The reader will note that our fault-injection
+technique is limited to disk I/O related channels. To exercise all
+error-broken channels, techniques such as symbolic execution and
+directed
+testing&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#EnglerDunbar07-UnderConstrained">9</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#GodefroidEtAl05-DART">10</a>]
+that simulate the environment of the component in test would be of
+great utility.
+
+</p><p>
+
+</p><h3><a name="SECTION00032400000000000000"><br>
+2.2.4 Limitations</a>
+</h3>
+
+<p>
+Error propagation has complex characteristics: correct error codes
+must be returned; each subsystem uses both generic and specific error
+codes; one error code could be mapped to another; error codes are
+stored not only in scalar variables but also in structures (<i>e.g.</i>,
+control blocks); and error codes flow not only through function calls
+but also asynchronously via interrupts and callbacks.
+In our static analysis, we have not modeled all these characteristics.
+Nevertheless, by just focusing on the propagation of basic error codes
+via function call, we have found numerous violations that need to be
+fixed. A more complete tool that covers the properties above would
+uncover even more incorrect error handling.
+
+
+</p><h1><a name="SECTION00040000000000000000"></a>
+<a name="sec-result"></a><br>
+3 Results
+</h1>
+
+<p>
+We have performed EDP analysis on all file systems and storage device
+drivers in Linux 2.6.15.4. Our analysis studies how 34 basic error
+codes (<i>e.g.</i>, <tt><font size="-1">EIO</font></tt> and <tt><font size="-1">ENOMEM</font></tt>) defined in
+<tt><font size="-1">include/asm-generic/errno-base.h</font></tt> propagate through these
+subsystems. We examine these basic error codes because they involve
+thousands of functions and propagate across thousands of calls.
+
+</p><p>
+In these results, we distinguish two types of violations that make up
+an error-broken channel: unsaved and unchecked error codes
+(overwritten codes have been deferred to future work; see
+Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-future-overwritten">5.1</a> for more information).
+An <em>unsaved error code</em> is found when a callee propagates an error
+code via the return value, but the caller does not save the return
+value (<i>i.e.</i>, it is treated as a void-returning call even though it
+actually returns an error code). Throughout the paper, we refer to
+this type of broken channel as a "<em>bad call</em>." An <em>unchecked
+error code</em> is found when a variable that may contain an error code is
+neither checked nor used in the future; we always refer to this case
+as an unchecked code.
+
+</p><p>
+
+</p><h2><a name="SECTION00041000000000000000"></a>
+<a name="sec-result-unsaved"></a>
+3.1 Unsaved Error Codes
+</h2>
+
+<p>
+First, we report the number of error-broken channels due to a caller
+simply not saving the returned error code (<i>i.e.</i>, the number of bad
+calls). The simplified HFS code below shows an example of an unsaved
+error code. The function <tt><font size="-1">find_init</font></tt> accepts a new uninitialized
+<tt><font size="-1">find_data</font></tt> structure (line 2), allocates a memory space for the
+<tt><font size="-1">search_key</font></tt> field (line 3), and returns <tt><font size="-1">ENOMEM</font></tt> error code
+when the memory allocation fails (line 5). However, one of its
+callers, <tt><font size="-1">file_lookup</font></tt>, does not save the returned error code
+(line 10) but tries to access the <tt><font size="-1">search_key</font></tt> field which still
+points to <tt><font size="-1">NULL</font></tt> (line 11). Hence, a null-pointer dereference
+takes place and the system could crash or corrupt data.
+
+</p><p>
+</p><pre> 1 // hfs/bfind.c
+ 2 int find_init(find_data *fd) {
+ 3 fd-&gt;search_key = kmalloc(..)
+ 4 if (!fd-&gt;search_key)
+ 5 return -ENOMEM;
+ 6 ...
+ 7 }
+ 8 // hfs/inode.c
+ 9 int file_lookup() {
+ 10 find_init(fd); /* NOT-SAVED E.C */
+ 11 fd-&gt;search_key-&gt;cat = ...; /* BAD!! */
+ 12 ...
+ 13 }
+</pre>
+
+<p>
+To show how EDP is useful in finding error propagation bugs, we begin
+by showing a sample of EDP analysis for a simple file system, Apple
+HFS. Then, we present our findings on all subsystems that we analyze,
+and finally discuss false positives.
+
+</p><p>
+
+<!-- ------------------------------------- HFS -->
+
+</p><h3><a name="SECTION00041100000000000000"><br>
+3.1.1 EDP on Apple HFS</a>
+</h3>
+
+<p>
+Figure&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-big">2</a> depicts the EDP output when analyzing the
+propagation of the 34 basic error codes in the Apple HFS file system.
+There are two important elements that EDP produces in order to ease
+the debugging process. First, EDP generates an error propagation graph
+that only includes functions and function calls through which the
+analyzed error codes propagate. From the graph, one can easily catch
+all bad calls and functions that make the bad calls. Second, EDP
+provides a table that presents more detailed information for each bad
+call (<i>e.g.</i>, the location where the bad call is made).
+
+
+</p><p><a name="fig-result-big"></a></p>
+
+<!-- table-->
+<table border="0" cellspacing="0" cellpadding="0" align="center">
+<tbody><tr><td>
+
+<!-- violation -->
+<div align="CENTER">
+</div><div align="CENTER"><div align="CENTER">
+</div><table width="323">
+<tbody><tr><td>
+ <table cellpadding="3" border="1" align="CENTER">
+<tbody><tr><td align="RIGHT" colspan="1"><font size="-1">
+ </font><font size="-1"><b>Viol#</b></font></td>
+<td align="CENTER" colspan="2"><font size="-1">
+ </font><font size="-1"><b>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Caller&nbsp;&nbsp;&#8594;&nbsp;&nbsp;Callee</b></font></td>
+<td align="LEFT" colspan="1"><font size="-1">
+ </font><font size="-1"><b>Filename</b></font></td>
+<td align="RIGHT" colspan="1"><font size="-1">
+ </font><font size="-1"><b>Line#</b></font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+
+</font><font size="-1"><b>A</b> </font></td>
+<td align="RIGHT"><font size="-1"> file_lookup </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> inode.c </font></td>
+<td align="RIGHT"><font size="-1"> 493 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>B</b> </font></td>
+<td align="RIGHT"><font size="-1"> fill_super </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> super.c </font></td>
+<td align="RIGHT"><font size="-1"> 385 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>C</b> </font></td>
+<td align="RIGHT"><font size="-1"> lookup </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> dir.c </font></td>
+<td align="RIGHT"><font size="-1"> 30 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>D</b> </font></td>
+<td align="RIGHT"><font size="-1"> brec_updt_prnt </font></td>
+<td align="LEFT"><font size="-1"> __brec_find </font></td>
+<td align="LEFT"><font size="-1"> brec.c </font></td>
+<td align="RIGHT"><font size="-1"> 405 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>E</b> </font></td>
+<td align="RIGHT"><font size="-1"> brec_updt_prnt </font></td>
+<td align="LEFT"><font size="-1"> __brec_find </font></td>
+<td align="LEFT"><font size="-1"> brec.c </font></td>
+<td align="RIGHT"><font size="-1"> 345 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>F</b> </font></td>
+<td align="RIGHT"><font size="-1"> cat_delete </font></td>
+<td align="LEFT"><font size="-1"> free_fork </font></td>
+<td align="LEFT"><font size="-1"> catalog.c </font></td>
+<td align="RIGHT"><font size="-1"> 228 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>G</b> </font></td>
+<td align="RIGHT"><font size="-1"> cat_delete </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> catalog.c </font></td>
+<td align="RIGHT"><font size="-1"> 213 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>H</b> </font></td>
+<td align="RIGHT"><font size="-1"> cat_create </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> catalog.c </font></td>
+<td align="RIGHT"><font size="-1"> 95 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>I</b> </font></td>
+<td align="RIGHT"><font size="-1"> file_trunc </font></td>
+<td align="LEFT"><font size="-1"> free_exts </font></td>
+<td align="LEFT"><font size="-1"> extent.c </font></td>
+<td align="RIGHT"><font size="-1"> 507 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>J</b> </font></td>
+<td align="RIGHT"><font size="-1"> file_trunc </font></td>
+<td align="LEFT"><font size="-1"> free_exts </font></td>
+<td align="LEFT"><font size="-1"> extent.c </font></td>
+<td align="RIGHT"><font size="-1"> 497 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>K</b> </font></td>
+<td align="RIGHT"><font size="-1"> file_trunc </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> extent.c </font></td>
+<td align="RIGHT"><font size="-1"> 494 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>L</b> </font></td>
+<td align="RIGHT"><font size="-1"> ext_write_ext </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> extent.c </font></td>
+<td align="RIGHT"><font size="-1"> 135 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>M</b> </font></td>
+<td align="RIGHT"><font size="-1"> ext_read_ext </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> extent.c </font></td>
+<td align="RIGHT"><font size="-1"> 188 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>N</b> </font></td>
+<td align="RIGHT"><font size="-1"> brec_rmv </font></td>
+<td align="LEFT"><font size="-1"> __brec_find </font></td>
+<td align="LEFT"><font size="-1"> brec.c </font></td>
+<td align="RIGHT"><font size="-1"> 193 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>O</b> </font></td>
+<td align="RIGHT"><font size="-1"> readdir </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> dir.c </font></td>
+<td align="RIGHT"><font size="-1"> 68 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>P</b> </font></td>
+<td align="RIGHT"><font size="-1"> cat_move </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> catalog.c </font></td>
+<td align="RIGHT"><font size="-1"> 280 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>Q</b> </font></td>
+<td align="RIGHT"><font size="-1"> brec_insert </font></td>
+<td align="LEFT"><font size="-1"> __brec_find </font></td>
+<td align="LEFT"><font size="-1"> brec.c </font></td>
+<td align="RIGHT"><font size="-1"> 145 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>R</b> </font></td>
+<td align="RIGHT"><font size="-1"> free_fork </font></td>
+<td align="LEFT"><font size="-1"> free_exts </font></td>
+<td align="LEFT"><font size="-1"> extent.c </font></td>
+<td align="RIGHT"><font size="-1"> 307 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+</font><font size="-1"><b>S</b> </font></td>
+<td align="RIGHT"><font size="-1"> free_fork </font></td>
+<td align="LEFT"><font size="-1"> find_init </font></td>
+<td align="LEFT"><font size="-1"> extent.c </font></td>
+<td align="RIGHT"><font size="-1"> 301 </font></td>
+</tr>
+</tbody></table>
+ </td></tr>
+</tbody></table>
+
+
+</div></td>
+<td>
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-big-legend.gif">
+</td></tr>
+<tr>
+<td colspan="2">
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-big.gif">
+</td></tr>
+</tbody></table>
+
+<br>
+<font size="-1"><i>
+Figure 2: <b>A Sample of EDP Output.</b> The lower figure
+depicts the EDP output for the HFS file system. Some function names
+have been shortened to improve readability. As summarized in the
+upper right legend, a gray node with a thicker border represents a
+function that generates an error code. The other gray node represents
+the same thing, but the function also propagates the error code
+received from its callee. A white node represents a good function,
+i.e., it either propagates the error code to its caller or if it does
+not propagate the error code it minimally checks the error code. A
+black node represents an error-broken termination endpoint, i.e., it is
+a function that commits the violation of unsaved error codes. The
+darker and thicker edge coming out from a black node implies a broken
+error channel (a bad call); an error code actually flows from its
+callee, but the caller drops the error code. For ease of debugging,
+each bad call is labeled with a violation number whose detailed
+information can be found in the upper left violation table. For
+example, violation #E found in the bottom left corner of the graph is
+a bad call made by <tt>brec_updt_prnt</tt> when calling <tt>__brec_find</tt>,
+which can be located in <tt>fs/hfs/brec.c</tt> line
+345.
+</i></font>
+<br>
+
+
+
+<p>
+Using the information that EDP provides, we found three major
+error-handling inconsistencies in HFS. First, 11 out of 14 calls to
+<tt><font size="-1">find_init</font></tt> drop the returned error codes. As described earlier in
+this section, this bug could cause the system to crash or corrupt
+data. Second, 4 out of 5 total calls to the function
+<tt><font size="-1">__brec_find</font></tt> are bad calls (as indicated by the four black
+edges, E, D, N, and Q, found in the lower left of the graph). The
+task of this function is to find a record in an HFS node that best
+matches the given key, and return <tt><font size="-1">ENOENT</font></tt> (no entry) error code if
+it fails. The only call that saves this error code is made by the
+wrapper, <tt><font size="-1">brec_find</font></tt>. Interestingly, all 18 calls to this wrapper
+propagate the error code properly (as indicated by all gray edges
+coming into the function).
+
+</p><p>
+Finally, 3 out of 4 calls to <tt><font size="-1">free_exts</font></tt> do not save the returned
+error code (labeled R, I, and J). This function traverses a list of
+extents and locates the extents to be freed. If the extents cannot be
+found, the function returns <tt><font size="-1">EIO</font></tt>. More interestingly, the
+developer wrote a comment "panic?" just before the return statement
+(maybe in the hope that in this failure case the callers will call
+panic, which will never happen if the error code is dropped). By and
+large, we found similar inconsistencies in all the subsystems we
+analyzed. The fact that the fraction of bad calls over all calls to a
+function is generally high is intriguing, and will be discussed
+further in Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-analysis-inconsistent">4.3</a>.
+
+
+
+
+<!-- ------------------------------------- all -->
+
+</p><h3><a name="SECTION00041200000000000000"><br>
+3.1.2 EDP on All File Systems and Storage Drivers</a>
+</h3>
+
+<p>
+Figures&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-small-1">3</a> and&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-small-2">4</a> show EDP
+outputs for six more file systems whose error-propagation graphs
+represent an interesting sample. EDP outputs for the rest of the file
+systems can be downloaded from our web site&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#EdpOutput">11</a>].
+A small file system such as HFS+ has simple propagation chains, yet
+bad calls are still made. More complex error propagation can be seen
+in ext3, ReiserFS, and IBM JFS; within these file systems, error codes
+propagate throughout 180 to 340 function calls. The error propagation
+in NFS is more structured compared to other file systems. Finally,
+among all file systems we analyze, XFS has the most complex error
+propagation chain; almost 1500 function calls propagate error codes.
+Note that each graph in Figures&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-small-1">3</a>
+and&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-small-2">4</a> was produced by analyzing each file
+system in isolation (<i>i.e.</i>, the graph only shows intra-module but not
+inter-module calls), yet they already illustrate the complexity of
+error code propagation in each file system. Manual code inspection
+would require a tremendous amount of work to find error-propagation
+bugs.
+
+
+
+
+
+</p><p>
+
+</p><p>
+
+</p><div align="CENTER">
+
+<p><a name="fig-result-small-1"></a></p><div align="CENTER">
+<font size="+1"><b>HFS+</b></font>&nbsp;&nbsp;&nbsp;[ 22 bad / 84 calls, 26%] </div>
+ <br>
+<br>
+ <div align="CENTER">
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-small-hfsplus.gif">
+</div>
+ <br>
+
+ <br>
+<br>
+ <div align="CENTER">
+<font size="+1"><b>ext3</b></font>&nbsp;&nbsp;&nbsp;[ 37 bad / 188 calls, 20%] </div>
+ <br>
+<br>
+ <div align="CENTER">
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-small-ext3.gif">
+</div>
+ <br>
+</div>
+
+ <br>
+<br>
+ <div align="CENTER">
+<font size="+1"><b>ReiserFS</b></font>&nbsp;&nbsp;&nbsp;[ 35 bad / 218 calls, 16% ] </div>
+ <br>
+<br>
+ <div align="CENTER">
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-small-reiserfs.gif"></div>
+ <br>
+<br> <br>
+<font size="-1"><i>
+Figure 3: <b>More Samples of EDP Output.</b>
+The figures illustrate the prevalent problem of incomplete
+error-propagation across different types of file systems. Details such
+as function names and violation numbers have been removed. Gray edges
+represent calls that propagate error codes. Black edges represent bad
+calls. The numbers of edges are reported in [ X / Y , Z% ] format, where X and
+Y represent the number of black edges and of all (gray and black) edges,
+respectively, and Z represents X as a percentage of Y. For more
+information, please see the legend in Figure 2. </i></font>
+<br>
+
+
+
+<p>
+
+</p><p>
+
+</p><div align="CENTER">
+
+<p><a name="fig-result-small-2"></a></p><div align="CENTER">
+<font size="+1"><b>IBM JFS</b></font>&nbsp;&nbsp;&nbsp;[ 61 bad / 340 calls, 18% ]</div>
+
+ <br>
+<br>
+ <div align="CENTER">
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-small-jfs.gif"></div>
+ <br>
+
+ <br>
+<br>
+ <div align="CENTER">
+<font size="+1"><b>NFS Client</b></font>&nbsp;&nbsp;&nbsp;[ 54 bad / 446 calls, 12% ]</div>
+
+ <br>
+<br>
+ <div align="CENTER">
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-small-nfs.gif"></div>
+ <br>
+
+ <br>
+<br>
+ <div align="CENTER">
+<font size="+1"><b>XFS</b></font>&nbsp;&nbsp;&nbsp;[ 105 bad / 1453 calls, 7% ]</div>
+
+ <br>
+<br>
+ <div align="CENTER">
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-small-xfs.gif"></div>
+ <br>
+<br>
+<br>
+<font size="-1"><i>
+Figure 4: <b>More Samples of EDP Output (Cont'd).</b>
+Please see caption in Figure 3.</i></font>
+<br>
+
+</div>
+
+
+
+<!-- table all -->
+<p>
+Next, we analyzed the propagation of error codes across all file
+systems and storage device drivers as a whole. All inter-module calls
+were connected by our EDP channel constructor, which connects all
+function pointer calls; hence, we were able to catch inter-module bad
+calls in addition to intra-module ones. Table&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#table-result-all">2</a>
+summarizes our findings. Note that the number of violations reported
+is higher than the numbers reported in
+Figures&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-big">2</a>,&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-small-1">3</a>,
+and&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-small-2">4</a> because we catch more bugs when we
+analyze each file system in conjunction with other subsystems (<i>e.g.</i>,
+ext3 with the journaling layer, VFS, and the memory management).
+
+</p><p>
+Surprisingly, out of 9022 error channels, 1153 (or nearly 13%)
+constitute bad calls. This appears to be a long-standing problem. We
+ran a partial analysis in Linux 2.4 (not shown) and found that the
+magnitude of incomplete error code propagation is essentially the
+same. In Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-analysis">4</a>, we try to dissect the root
+causes of this problem.
+
+
+
+<a name="table-result-all"></a>
+<table border="0" align="center" cellspacing="50" cellpadding="0">
+<tbody><tr><td>
+<p>
+<table cellpadding="3" cellspacing="0" border="1" align="center">
+<tbody><tr><td colspan="6" align="center" bgcolor="#FFFFCC">
+ <font size="+1"><b>File Systems</b></font></td>
+</tr>
+<tr bgcolor="#FFFFCC">
+ <td align="center"><font size="-1" color="white">.</font></td>
+ <td align="center"><b><font size="-1">Bad Calls</font></b></td>
+ <td align="center"><b><font size="-1">EC Calls</font></b></td>
+ <td align="center"><b><font size="-1">Size (kloc)</font></b></td>
+ <td align="center"><b><font size="-1">Frac (%)</font></b></td>
+ <td align="center"><b><font size="-1">Viol/kloc</font></b></td>
+</tr>
+<tr><td><font size="-1">XFS </font></td> <td align="right"><font size="-1"><b> 101 </b></font></td><td align="right"><font size="-1"> 1457 </font></td><td align="right"><font size="-1"> 71 </font></td><td align="right"><font size="-1"> 6.9 </font></td><td align="right"><font size="-1"> 1.4 </font></td></tr> <!-- fs/xfs/ -->
+<tr><td><font size="-1">Virtual FS </font></td> <td align="right"><font size="-1"><b> 96 </b></font></td><td align="right"><font size="-1"> 1149 </font></td><td align="right"><font size="-1"> 34 </font></td><td align="right"><font size="-1"> 8.4 </font></td><td align="right"><font size="-1"> 2.9 </font></td></tr> <!-- fs/vfs/ -->
+<tr><td><font size="-1">IBM JFS </font></td> <td align="right"><font size="-1"><b> 95 </b></font></td><td align="right"><font size="-1"> 390 </font></td><td align="right"><font size="-1"> 17 </font></td><td align="right"><font size="-1"> 24.4 </font></td><td align="right"><font size="-1"> 5.6 </font></td></tr> <!-- fs/jfs/ -->
+<tr><td><font size="-1">ext3 </font></td> <td align="right"><font size="-1"><b> 80 </b></font></td><td align="right"><font size="-1"> 362 </font></td><td align="right"><font size="-1"> 12 </font></td><td align="right"><font size="-1"> 22.1 </font></td><td align="right"><font size="-1"> 7.2 </font></td></tr> <!-- fs/ext3/ -->
+<tr><td><font size="-1">NFS Client </font></td> <td align="right"><font size="-1"><b> 62 </b></font></td><td align="right"><font size="-1"> 482 </font></td><td align="right"><font size="-1"> 18 </font></td><td align="right"><font size="-1"> 12.9 </font></td><td align="right"><font size="-1"> 3.6 </font></td></tr> <!-- fs/nfs/ -->
+<tr><td><font size="-1">CIFS </font></td> <td align="right"><font size="-1"><b> 43 </b></font></td><td align="right"><font size="-1"> 339 </font></td><td align="right"><font size="-1"> 21 </font></td><td align="right"><font size="-1"> 12.7 </font></td><td align="right"><font size="-1"> 2.1 </font></td></tr> <!-- fs/cifs/ -->
+<tr><td><font size="-1">ReiserFS </font></td> <td align="right"><font size="-1"><b> 42 </b></font></td><td align="right"><font size="-1"> 399 </font></td><td align="right"><font size="-1"> 24 </font></td><td align="right"><font size="-1"> 10.5 </font></td><td align="right"><font size="-1"> 1.8 </font></td></tr> <!-- fs/reiserfs/ -->
+<tr><td><font size="-1">Mem. Mgmt. </font></td> <td align="right"><font size="-1"><b> 40 </b></font></td><td align="right"><font size="-1"> 351 </font></td><td align="right"><font size="-1"> 20 </font></td><td align="right"><font size="-1"> 11.4 </font></td><td align="right"><font size="-1"> 2.0 </font></td></tr> <!-- mm/ -->
+<tr><td><font size="-1">Apple HFS+ </font></td> <td align="right"><font size="-1"><b> 25 </b></font></td><td align="right"><font size="-1"> 98 </font></td><td align="right"><font size="-1"> 7 </font></td><td align="right"><font size="-1"> 25.5 </font></td><td align="right"><font size="-1"> 3.7 </font></td></tr> <!-- fs/hfsplus/ -->
+<tr><td><font size="-1">JFFS v2 </font></td> <td align="right"><font size="-1"><b> 24 </b></font></td><td align="right"><font size="-1"> 153 </font></td><td align="right"><font size="-1"> 11 </font></td><td align="right"><font size="-1"> 15.7 </font></td><td align="right"><font size="-1"> 2.2 </font></td></tr> <!-- fs/jffs2/ --> <!-- break drivers -->
+<tr><td><font size="-1">Apple HFS </font></td> <td align="right"><font size="-1"><b> 20 </b></font></td><td align="right"><font size="-1"> 76 </font></td><td align="right"><font size="-1"> 5 </font></td><td align="right"><font size="-1"> 26.3 </font></td><td align="right"><font size="-1"> 4.8 </font></td></tr> <!-- fs/hfs/ -->
+<tr><td><font size="-1">SMB </font></td> <td align="right"><font size="-1"><b> 19 </b></font></td><td align="right"><font size="-1"> 196 </font></td><td align="right"><font size="-1"> 6 </font></td><td align="right"><font size="-1"> 9.7 </font></td><td align="right"><font size="-1"> 3.5 </font></td></tr> <!-- fs/smbfs/ -->
+<tr><td><font size="-1">ext2 </font></td> <td align="right"><font size="-1"><b> 18 </b></font></td><td align="right"><font size="-1"> 103 </font></td><td align="right"><font size="-1"> 6 </font></td><td align="right"><font size="-1"> 17.5 </font></td><td align="right"><font size="-1"> 3.3 </font></td></tr> <!-- fs/ext2/ -->
+<tr><td><font size="-1">AFS </font></td> <td align="right"><font size="-1"><b> 16 </b></font></td><td align="right"><font size="-1"> 62 </font></td><td align="right"><font size="-1"> 7 </font></td><td align="right"><font size="-1"> 25.8 </font></td><td align="right"><font size="-1"> 2.6 </font></td></tr> <!-- fs/afs/ -->
+<tr><td><font size="-1">NTFS </font></td> <td align="right"><font size="-1"><b> 15 </b></font></td><td align="right"><font size="-1"> 186 </font></td><td align="right"><font size="-1"> 18 </font></td><td align="right"><font size="-1"> 8.1 </font></td><td align="right"><font size="-1"> 0.9 </font></td></tr> <!-- fs/ntfs/ -->
+<tr><td><font size="-1">NFS Server </font></td> <td align="right"><font size="-1"><b> 15 </b></font></td><td align="right"><font size="-1"> 265 </font></td><td align="right"><font size="-1"> 14 </font></td><td align="right"><font size="-1"> 5.7 </font></td><td align="right"><font size="-1"> 1.2 </font></td></tr> <!-- fs/nfsd/ -->
+<tr><td><font size="-1">NCP </font></td> <td align="right"><font size="-1"><b> 13 </b></font></td><td align="right"><font size="-1"> 169 </font></td><td align="right"><font size="-1"> 5 </font></td><td align="right"><font size="-1"> 7.7 </font></td><td align="right"><font size="-1"> 2.6 </font></td></tr> <!-- fs/ncpfs/ -->
+<tr><td><font size="-1">UFS </font></td> <td align="right"><font size="-1"><b> 12 </b></font></td><td align="right"><font size="-1"> 44 </font></td><td align="right"><font size="-1"> 5 </font></td><td align="right"><font size="-1"> 27.3 </font></td><td align="right"><font size="-1"> 2.6 </font></td></tr> <!-- fs/ufs/ -->
+<tr><td><font size="-1">JBD </font></td> <td align="right"><font size="-1"><b> 10 </b></font></td><td align="right"><font size="-1"> 43 </font></td><td align="right"><font size="-1"> 4 </font></td><td align="right"><font size="-1"> 23.3 </font></td><td align="right"><font size="-1"> 2.6 </font></td></tr> <!-- fs/jbd/ -->
+<tr><td><font size="-1">FAT </font></td> <td align="right"><font size="-1"><b> 9 </b></font></td><td align="right"><font size="-1"> 81 </font></td><td align="right"><font size="-1"> 4 </font></td><td align="right"><font size="-1"> 11.1 </font></td><td align="right"><font size="-1"> 2.9 </font></td></tr> <!-- fs/fat/ -->
+<tr><td><font size="-1">Plan 9 </font></td> <td align="right"><font size="-1"><b> 9 </b></font></td><td align="right"><font size="-1"> 80 </font></td><td align="right"><font size="-1"> 4 </font></td><td align="right"><font size="-1"> 11.2 </font></td><td align="right"><font size="-1"> 2.4 </font></td></tr> <!-- fs/9p/ -->
+<tr><td><font size="-1">System V </font></td> <td align="right"><font size="-1"><b> 7 </b></font></td><td align="right"><font size="-1"> 30 </font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 23.3 </font></td><td align="right"><font size="-1"> 3.2 </font></td></tr> <!-- fs/sysv/ -->
+<tr><td><font size="-1">JFFS </font></td> <td align="right"><font size="-1"><b> 7 </b></font></td><td align="right"><font size="-1"> 56 </font></td><td align="right"><font size="-1"> 5 </font></td><td align="right"><font size="-1"> 12.5 </font></td><td align="right"><font size="-1"> 1.4 </font></td></tr> <!-- fs/jffs/ -->
+<tr><td><font size="-1">UDF </font></td> <td align="right"><font size="-1"><b> 6 </b></font></td><td align="right"><font size="-1"> 50 </font></td><td align="right"><font size="-1"> 9 </font></td><td align="right"><font size="-1"> 12.0 </font></td><td align="right"><font size="-1"> 0.7 </font></td></tr> <!-- fs/udf/ -->
+<tr><td><font size="-1">MSDOS </font></td> <td align="right"><font size="-1"><b> 5 </b></font></td><td align="right"><font size="-1"> 39 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 12.8 </font></td><td align="right"><font size="-1"> 9.3 </font></td></tr> <!-- fs/msdos/ -->
+<tr><td><font size="-1">VFAT </font></td> <td align="right"><font size="-1"><b> 4 </b></font></td><td align="right"><font size="-1"> 39 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 10.3 </font></td><td align="right"><font size="-1"> 5.0 </font></td></tr> <!-- fs/vfat/ -->
+<tr><td><font size="-1">Minix </font></td> <td align="right"><font size="-1"><b> 4 </b></font></td><td align="right"><font size="-1"> 31 </font></td><td align="right"><font size="-1"> 4 </font></td><td align="right"><font size="-1"> 12.9 </font></td><td align="right"><font size="-1"> 1.2 </font></td></tr> <!-- fs/minix/ -->
+<tr><td><font size="-1">FUSE </font></td> <td align="right"><font size="-1"><b> 4 </b></font></td><td align="right"><font size="-1"> 48 </font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 8.3 </font></td><td align="right"><font size="-1"> 1.5 </font></td></tr> <!-- fs/fuse/ --> <!-- break fs -->
+<tr><td><font size="-1">Automounter4 </font></td> <td align="right"><font size="-1"><b> 4 </b></font></td><td align="right"><font size="-1"> 53 </font></td><td align="right"><font size="-1"> 2 </font></td><td align="right"><font size="-1"> 7.5 </font></td><td align="right"><font size="-1"> 2.7 </font></td></tr> <!-- fs/autofs4/ -->
+<tr><td><font size="-1">NFS Lockd </font></td> <td align="right"><font size="-1"><b> 3 </b></font></td><td align="right"><font size="-1"> 21 </font></td><td align="right"><font size="-1"> 4 </font></td><td align="right"><font size="-1"> 14.3 </font></td><td align="right"><font size="-1"> 0.8 </font></td></tr> <!-- fs/lockd/ -->
+<tr><td><font size="-1">Relayfs </font></td> <td align="right"><font size="-1"><b> 2 </b></font></td><td align="right"><font size="-1"> 5 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 40.0 </font></td><td align="right"><font size="-1"> 2.7 </font></td></tr> <!-- fs/relayfs/ -->
+<tr><td><font size="-1">Partitions </font></td> <td align="right"><font size="-1"><b> 2 </b></font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 4 </font></td><td align="right"><font size="-1"> 66.7 </font></td><td align="right"><font size="-1"> 0.6 </font></td></tr> <!-- fs/partitions/ -->
+<tr><td><font size="-1">ISO </font></td> <td align="right"><font size="-1"><b> 2 </b></font></td><td align="right"><font size="-1"> 19 </font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 10.5 </font></td><td align="right"><font size="-1"> 0.7 </font></td></tr> <!-- fs/isofs/ -->
+<tr><td><font size="-1">HugeTLB Sup </font></td> <td align="right"><font size="-1"><b> 2 </b></font></td><td align="right"><font size="-1"> 10 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 20.0 </font></td><td align="right"><font size="-1"> 3.0 </font></td></tr> <!-- fs/hugetlbfs/ -->
+<tr><td><font size="-1">Compr. ROM </font></td> <td align="right"><font size="-1"><b> 2 </b></font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 66.7 </font></td><td align="right"><font size="-1"> 4.5 </font></td></tr> <!-- fs/cramfs/ -->
+<tr><td><font size="-1">ADFS </font></td> <td align="right"><font size="-1"><b> 2 </b></font></td><td align="right"><font size="-1"> 30 </font></td><td align="right"><font size="-1"> 2 </font></td><td align="right"><font size="-1"> 6.7 </font></td><td align="right"><font size="-1"> 1.3 </font></td></tr> <!-- fs/adfs/ -->
+<tr><td><font size="-1">sysfs sup. </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 29 </font></td><td align="right"><font size="-1"> 2 </font></td><td align="right"><font size="-1"> 3.4 </font></td><td align="right"><font size="-1"> 0.8 </font></td></tr> <!-- fs/sysfs/ -->
+<tr><td><font size="-1">romfs sup. </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 33.3 </font></td><td align="right"><font size="-1"> 2.4 </font></td></tr> <!-- fs/romfs/ -->
+<tr><td><font size="-1">ramfs sup. </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 6 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 16.7 </font></td><td align="right"><font size="-1"> 6.0 </font></td></tr> <!-- fs/ramfs/ -->
+<tr><td><font size="-1">QNX 4 </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 8 </font></td><td align="right"><font size="-1"> 2 </font></td><td align="right"><font size="-1"> 12.5 </font></td><td align="right"><font size="-1"> 0.9 </font></td></tr> <!-- fs/qnx4/ -->
+<tr><td><font size="-1">proc fs sup. </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 44 </font></td><td align="right"><font size="-1"> 6 </font></td><td align="right"><font size="-1"> 2.3 </font></td><td align="right"><font size="-1"> 0.2 </font></td></tr> <!-- fs/proc/ -->
+<tr><td><font size="-1">OS/2 HPFS </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 18 </font></td><td align="right"><font size="-1"> 6 </font></td><td align="right"><font size="-1"> 5.6 </font></td><td align="right"><font size="-1"> 0.2 </font></td></tr> <!-- fs/hpfs/ -->
+<tr><td><font size="-1">FreeVxFS </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 4 </font></td><td align="right"><font size="-1"> 2 </font></td><td align="right"><font size="-1"> 25.0 </font></td><td align="right"><font size="-1"> 0.7 </font></td></tr> <!-- fs/freevxfs/ -->
+<tr><td><font size="-1">EFS </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 33.3 </font></td><td align="right"><font size="-1"> 1.4 </font></td></tr> <!-- fs/efs/ -->
+<tr><td><font size="-1">devpts </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 2 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 50.0 </font></td><td align="right"><font size="-1"> 6.2 </font></td></tr> <!-- fs/devpts/ -->
+<tr><td><font size="-1">Boot FS </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 9 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 11.1 </font></td><td align="right"><font size="-1"> 1.2 </font></td></tr> <!-- fs/bfs/ -->
+<tr><td><font size="-1">BeOS </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 5 </font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 20.0 </font></td><td align="right"><font size="-1"> 0.5 </font></td></tr> <!-- fs/befs/ -->
+<tr><td><font size="-1">Automounter </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 41 </font></td><td align="right"><font size="-1"> 2 </font></td><td align="right"><font size="-1"> 2.4 </font></td><td align="right"><font size="-1"> 1.0 </font></td></tr> <!-- fs/autofs/ -->
+<tr><td><font size="-1">Amiga FFS </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 34 </font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 2.9 </font></td><td align="right"><font size="-1"> 0.3 </font></td></tr> <!-- fs/affs/ -->
+<tr><td><font size="-1">exportfs sup. </font></td> <td align="right"><font size="-1"><b> 0 </b></font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 1 </font></td><td align="right"><font size="-1"> 0.0 </font></td><td align="right"><font size="-1"> 0.0 </font></td></tr> <!-- fs/exportfs/ -->
+<tr><td><font size="-1">Coda </font></td> <td align="right"><font size="-1"><b> 0 </b></font></td><td align="right"><font size="-1"> 149 </font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 0.0 </font></td><td align="right"><font size="-1"> 0.0 </font></td></tr> <!-- fs/coda/ -->
+<tr><td><font size="-1"><b> Total</b> </font></td> <td align="right"><font size="-1"><b> 0 </b></font></td><td align="right"><font size="-1"> 7278 </font></td><td align="right"><font size="-1"> 366 </font></td><td align="right"><font size="-1"> -- </font></td><td align="right"><font size="-1"> -- </font></td></tr> <!-- TOTAL = 51 -->
+<tr><td><font size="-1"><b> Average</b> </font></td> <td align="right"><font size="-1"><b> 16.3 </b></font></td><td align="right"><font size="-1"> 142.7 </font></td><td align="right"><font size="-1"> 7.2 </font></td><td align="right"><font size="-1"><b> 17.0 </b></font></td><td align="right"><font size="-1"><b> 2.4 </b></font></td></tr> <!-- 51 -->
+</tbody></table>
+</p></td><td valign="top">
+<p>
+<table cellpadding="3" cellspacing="0" border="1" align="center">
+<tbody><tr><td colspan="6" align="center" bgcolor="#FFFFCC">
+ <font size="+1"><b>Storage Drivers</b></font></td>
+</tr>
+<tr bgcolor="#FFFFCC">
+ <td align="center"><font size="-1" color="white">.</font></td>
+ <td align="center"><b><font size="-1">Bad Calls</font></b></td>
+ <td align="center"><b><font size="-1">EC Calls</font></b></td>
+ <td align="center"><b><font size="-1">Size (kloc)</font></b></td>
+ <td align="center"><b><font size="-1">Frac (%)</font></b></td>
+ <td align="center"><b><font size="-1">Viol/kloc</font></b></td>
+</tr>
+<tr><td><font size="-1">SCSI (root) </font></td> <td align="right"><font size="-1"><b> 123 </b></font></td><td align="right"><font size="-1"> 628 </font></td><td align="right"><font size="-1"> 198 </font></td><td align="right"><font size="-1"> 19.6 </font></td><td align="right"><font size="-1"> 0.6 </font></td></tr> <!-- drivers/scsi/root/ -->
+<tr><td><font size="-1">IDE (root) </font></td> <td align="right"><font size="-1"><b> 53 </b></font></td><td align="right"><font size="-1"> 223 </font></td><td align="right"><font size="-1"> 15 </font></td><td align="right"><font size="-1"> 23.8 </font></td><td align="right"><font size="-1"> 3.5 </font></td></tr> <!-- drivers/ide/root/ -->
+<tr><td><font size="-1">Block Dev (root) </font></td> <td align="right"><font size="-1"><b> 39 </b></font></td><td align="right"><font size="-1"> 195 </font></td><td align="right"><font size="-1"> 36 </font></td><td align="right"><font size="-1"> 20.0 </font></td><td align="right"><font size="-1"> 1.1 </font></td></tr> <!-- drivers/block/root2/ -->
+<tr><td><font size="-1">Software RAID </font></td> <td align="right"><font size="-1"><b> 31 </b></font></td><td align="right"><font size="-1"> 290 </font></td><td align="right"><font size="-1"> 32 </font></td><td align="right"><font size="-1"> 10.7 </font></td><td align="right"><font size="-1"> 1.0 </font></td></tr> <!-- drivers/md/ -->
+<tr><td><font size="-1">SCSI (aacraid) </font></td> <td align="right"><font size="-1"><b> 30 </b></font></td><td align="right"><font size="-1"> 76 </font></td><td align="right"><font size="-1"> 7 </font></td><td align="right"><font size="-1"> 39.5 </font></td><td align="right"><font size="-1"> 4.8 </font></td></tr> <!-- drivers/scsi/aacraid/ -->
+<tr><td><font size="-1">SCSI (lpfc) </font></td> <td align="right"><font size="-1"><b> 14 </b></font></td><td align="right"><font size="-1"> 30 </font></td><td align="right"><font size="-1"> 16 </font></td><td align="right"><font size="-1"> 46.7 </font></td><td align="right"><font size="-1"> 0.9 </font></td></tr> <!-- drivers/scsi/lpfc/ -->
+<tr><td><font size="-1">Blk Dev (P-IDE) </font></td> <td align="right"><font size="-1"><b> 11 </b></font></td><td align="right"><font size="-1"> 17 </font></td><td align="right"><font size="-1"> 8 </font></td><td align="right"><font size="-1"> 64.7 </font></td><td align="right"><font size="-1"> 1.5 </font></td></tr> <!-- drivers/block/paride/ -->
+<tr><td><font size="-1">SCSI aic7xxx </font></td> <td align="right"><font size="-1"><b> 8 </b></font></td><td align="right"><font size="-1"> 62 </font></td><td align="right"><font size="-1"> 37 </font></td><td align="right"><font size="-1"> 12.9 </font></td><td align="right"><font size="-1"> 0.2 </font></td></tr> <!-- drivers/scsi/aic7xxx/ -->
+<tr><td><font size="-1">IDE (pci) </font></td> <td align="right"><font size="-1"><b> 5 </b></font></td><td align="right"><font size="-1"> 106 </font></td><td align="right"><font size="-1"> 12 </font></td><td align="right"><font size="-1"> 4.7 </font></td><td align="right"><font size="-1"> 0.4 </font></td></tr> <!-- drivers/ide/pci/ -->
+<tr><td><font size="-1">IDE legacy </font></td> <td align="right"><font size="-1"><b> 2 </b></font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 3 </font></td><td align="right"><font size="-1"> 66.7 </font></td><td align="right"><font size="-1"> 0.8 </font></td></tr> <!-- drivers/ide/legacy/ --> <!-- break drivers -->
+<tr><td><font size="-1">Blk Layer Core </font></td> <td align="right"><font size="-1"><b> 2 </b></font></td><td align="right"><font size="-1"> 65 </font></td><td align="right"><font size="-1"> 8 </font></td><td align="right"><font size="-1"> 3.1 </font></td><td align="right"><font size="-1"> 0.3 </font></td></tr> <!-- block/root1/ -->
+<tr><td><font size="-1">SCSI megaraid </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 30 </font></td><td align="right"><font size="-1"> 6 </font></td><td align="right"><font size="-1"> 3.3 </font></td><td align="right"><font size="-1"> 0.2 </font></td></tr> <!-- drivers/scsi/megaraid/ -->
+<tr><td><font size="-1">Blk Dev (Eth) </font></td> <td align="right"><font size="-1"><b> 1 </b></font></td><td align="right"><font size="-1"> 5 </font></td><td align="right"><font size="-1"> 2 </font></td><td align="right"><font size="-1"> 20.0 </font></td><td align="right"><font size="-1"> 0.7 </font></td></tr> <!-- drivers/block/aoe/ -->
+<tr><td><font size="-1">SCSI (sym53c8) </font></td> <td align="right"><font size="-1"><b> 0 </b></font></td><td align="right"><font size="-1"> 6 </font></td><td align="right"><font size="-1"> 10 </font></td><td align="right"><font size="-1"> 0.0 </font></td><td align="right"><font size="-1"> 0.0 </font></td></tr> <!-- drivers/scsi/sym53c8xx_2/ -->
+<tr><td><font size="-1">SCSI (qla2xxx) </font></td> <td align="right"><font size="-1"><b> 0 </b></font></td><td align="right"><font size="-1"> 8 </font></td><td align="right"><font size="-1"> 49 </font></td><td align="right"><font size="-1"> 0.0 </font></td><td align="right"><font size="-1"> 0.0 </font></td></tr> <!-- drivers/scsi/qla2xxx/ -->
+<tr><td><font size="-1"><b> Total</b> </font></td> <td align="right"><font size="-1"><b> 320 </b></font></td><td align="right"><font size="-1"> 1744 </font></td><td align="right"><font size="-1"> 430 </font></td><td align="right"><font size="-1"> -- </font></td><td align="right"><font size="-1"> -- </font></td></tr> <!-- TOTAL = 15 -->
+<tr><td><font size="-1"><b> Average</b> </font></td> <td align="right"><font size="-1"><b> 21.3 </b></font></td><td align="right"><font size="-1"> 116.3 </font></td><td align="right"><font size="-1"> 28.6 </font></td><td align="right"><font size="-1"><b> 22.4 </b></font></td><td align="right"><font size="-1"><b> 1.1 </b></font></td></tr> <!-- 15 -->
+</tbody></table>
+</p></td></tr>
+</tbody></table>
+<br>
+<font size="-1"><i>
+Table 2: <b>Error-broken channels due to unsaved
+error codes.</b> These tables report the number of bad calls found across
+all file systems and storage device drivers in Linux 2.6.15.4. In
+each table, from left to right, the columns report the name of
+the subsystem, the number of bad calls, the number of error channels
+(i.e., the number of calls to functions that propagate error codes),
+the size of the subsystem,
+the fraction of bad calls over all error-related calls (ratio of
+2nd and 3rd column), and finally the number of violations
+per Kloc (ratio of 2nd and 4th column).
+We categorize a directory as a subsystem. Thus, for storage
+drivers, since different SCSI device drivers exist in the first-level
+of the <tt>scsi/</tt> directory, we put all of them as one subsystem. SCSI
+device drivers that are located in different directories (e.g.,
+<tt>scsi/lpfc/</tt>, <tt>scsi/aacraid/</tt>) are categorized as different
+subsystems. The same principle is applied to IDE.
+</i></font>
+
+</p><p>
+
+
+
+</p><p>
+
+</p><h3><a name="SECTION00041300000000000000"><br>
+3.1.3 False Positives</a>
+</h3>
+
+<p>
+It is important to note that while the number of bad calls is high,
+not all bad calls could cause damage to the system. The primary
+reason is what we call a <em>double error code</em>; some functions
+expose two or more error codes at the same time, and checking one of
+the error codes while ignoring the others can still be correct. For
+example, in the ReiserFS code below, the error code returned from
+<tt><font size="-1">sync_dirty_buffer</font></tt> does not have to be saved (line 8) <em>if
+and only if</em> the function performs the check on the second error code
+(line 9); the buffer must be checked whether it is up-to-date.
+
+</p><p>
+</p><pre> 1 // fs/buffer.c
+ 2 int sync_dirty_buffer (buffer_head* bh) {
+ 3 ...
+ 4 return ret; // RETURN ERROR CODE
+ 5 }
+ 6 // reiserfs/journal.c
+ 7 int flush_commit_list() {
+ 8 sync_dirty_buffer(bh); // UNSAVED EC
+ 9 if (!buffer_uptodate(bh)) {
+ 10 return -EIO;
+ 11 }
+ 12 }
+</pre>
+
+<p>
+To ensure that the number of false positives we report is not overly
+large, we manually analyze all of the code snippets to check whether a
+second error code is being checked. Note that this manual process can
+be automated if we incorporate all types of error codes into EDP. We
+have found only a total of 39 false positives, which have been
+excluded from the numbers we report in this paper. Thus, the high
+numbers in Table&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#table-result-all">2</a> provide a hint to a real and
+critical problem.
+
+</p><p>
+
+</p><h2><a name="SECTION00042000000000000000"></a>
+<a name="sec-result-silent"></a><br>
+3.2 Silent Failures: Manifestations of Unsaved Error Codes
+</h2>
+
+<p>
+
+</p><p>
+
+</p><p>
+To show that unsaved error codes represent a serious problem that can
+lead to silent failures, we injected disk block failures in a few
+cases. As shown in Figure&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-result-zoom">5</a>, one serious silent
+failure arises during file system recovery: the journaling block
+device layer (JBD) does not properly propagate any block write
+failures, including inode, directory, bitmap, superblock, and other
+block write failures. EDP unearths these silent failures by
+pinpointing the <tt><font size="-1">journal_recover</font></tt> function, which is responsible
+for file system recovery, as it calls <tt><font size="-1">sync_blockdev</font></tt> to flush the
+dirty buffer pages owned by the block device. Unfortunately,
+<tt><font size="-1">journal_recover</font></tt> does not save the error code propagated by
+<tt><font size="-1">sync_blockdev</font></tt> in the case of block write failures. This is an
+example where the error code is dropped in the middle of its
+propagation chain; <tt><font size="-1">sync_blockdev</font></tt> correctly propagates the <tt><font size="-1">EIO</font></tt> error codes received from the two function calls it makes.
+
+
+
+</p><div align="CENTER">
+
+<p><a name="fig-result-zoom">
+
+<table border="0" cellpadding="0" cellspacing="0">
+<tbody><tr><td>
+<img src="./Error Handling is Ocassionally Correct_files/fig-result-zoom.gif">
+</td><td>
+
+<pre>journal_recover()
+ /* BROKEN CHANNEL */
+ sync_blockdev();
+
+sync_blockdev()
+ ret = fm_fdatawrite();
+ err = fm_fdatawait();
+ if(!ret) ret = err;
+ /* PROPAGATE EIO */
+ return ret;
+</pre>
+</td></tr></tbody></table>
+
+<br>
+<font size="-1"><i>
+Figure 5: <b>Silent error in journal recovery.</b>
+In the figure on the left, EDP marks <tt>journal_recover</tt> as a termination
+endpoint of a broken channel. The code snippet on the right shows that
+<tt>journal_recover</tt> ignores the <tt>EIO</tt> propagated by <tt>sync_blockdev</tt>.
+</i></font>
+
+<br>
+
+</a></p></div><a name="fig-result-zoom">
+
+<p>
+A similar problem occurs in the NFS server code. From a similar
+failure injection experiment, we found that the NFS client is not
+informed when a write failure occurs during a <tt><font size="-1">sync</font></tt> operation. In
+the experiment, the client updates old data and then sends a <tt><font size="-1">sync</font></tt>
+operation with the data to the NFS server. The NFS server then invokes
+the <tt><font size="-1">nfsd_dosync</font></tt> operation, which mainly performs three
+operations similar to the <tt><font size="-1">sync_blockdev</font></tt> call above. First, the
+NFS server writes dirty pages to the disk; second, it writes dirty
+inodes and the superblock to disk; third, it waits until the ongoing
+I/O data transfer terminates. All these three operations could return
+error codes, but the implementation of <tt><font size="-1">nfsd_dosync</font></tt> does not save
+any return values. As a result, the NFS client will never notice any
+disk write failures occurring in the server. Thus, even a careful,
+error-robust client cannot trust the server to inform it of errors
+that occur.
+
+</p></a><p><a name="fig-result-zoom">
+In the NFS server code, we might expect that at least one return value
+would be saved and checked properly. However, no return values are
+saved, leading one to question whether the returned error codes from
+the <tt><font size="-1">write</font></tt> or <tt><font size="-1">sync</font></tt> operations are correctly handled in
+general. It could be the case that the developers are not concerned
+about write failures. We investigate this hypothesis in
+Section&nbsp;</a><a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-analysis-neglected">4.2</a>.
+
+</p><p>
+
+</p><h2><a name="SECTION00043000000000000000"></a>
+<a name="sec-result-unchecked"></a><br>
+3.3 Unchecked Error Code
+</h2>
+
+<p>
+Lastly, we report the number of error-broken channels due to a variable
+that contains an error code not being checked or used in the future.
+For example, in the IBM JFS code below, <tt><font size="-1">rc</font></tt> carries an error code
+propagated from <tt><font size="-1">txCommit</font></tt> (line 4), but <tt><font size="-1">rc</font></tt> is never checked.
+
+</p><p>
+</p><pre> 1 // jfs/jfs_txnmgr.c
+ 2 int jfs_sync () {
+ 3 int rc;
+ 4 rc = txCommit(); // UNCHECKED 'rc'
+ 5 // No usage or check of 'rc'
+ 6 // after this line
+ 7 }
+</pre>
+
+<p>
+This analysis can also report false positives due to the double error
+code problem described previously. In addition, we also find the
+problem of <em>overloaded variables</em> that contribute as false
+positives. We define a variable to be overloaded if the variable could
+contain an error code or a data value. For instance,
+<tt><font size="-1">blknum</font></tt> in the QNX4 code below is an example of an overloaded
+variable:
+
+</p><p>
+</p><pre> 1 // qnx4/dir.c
+ 2 int qnx4_readdir () {
+ 3 int blknum;
+ 4 struct buffer_head *bh;
+ 5 blknum = qnx4_block_map();
+ 6 bh = sb_bread (blknum);
+ 7 if (bh == NULL)
+ 8 // error
+ 9 }
+</pre>
+
+<p>
+In this code, <tt><font size="-1">qnx4_block_map</font></tt> could return an error code (line
+5), which is usually a negative value. <tt><font size="-1">sb_bread</font></tt> takes a block
+number and returns a buffer head that contains the data for that
+particular block (line 6). Since a negative block number will lead to
+a <tt><font size="-1">NULL</font></tt> buffer head (line 7), the error code stored in <tt><font size="-1">blknum</font></tt>
+does not have to be explicitly checked. The developer believes that
+the other part of the code will catch this error or eventually raise
+related errors. This practice reduces the accuracy of our static
+analysis.
+
+</p><p>
+Since the number of unchecked error code reports is small, we were
+able to remove the false positives and find a total of 3 and 2
+unchecked error codes in file systems and storage drivers,
+respectively, that could lead to silent failures.
+
+</p><p>
+
+</p><h1><a name="SECTION00050000000000000000"></a>
+<a name="sec-analysis"></a><br>
+4 Analysis of Results
+</h1>
+
+<p>
+In the following sections, we present five analyses whereby we try to
+uncover the root causes and impact of incomplete error propagation.
+Since the number of unchecked and overwritten error codes is small, we
+only consider unsaved error codes (bad calls) in our analyses; thus we
+use "bad calls" and "broken channels" interchangeably from now on.
+First, we made a correlation between robustness and complexity.
+Second, we analyzed whether file systems and storage device drivers
+give different treatment to errors occurring in I/O read vs.&nbsp;I/O write
+operations. From that analysis we find that many write errors are
+neglected; hence we perform the next study in which we try to answer
+whether ignored errors are corner-case mistakes or intentional
+choices. In the final two analyses, we analyze whether chained error
+propagation and inter-module calls play major parts in causing
+incorrect error propagation.
+
+</p><p>
+
+</p><h2><a name="SECTION00051000000000000000"><br>
+4.1 Complexity and Robustness</a>
+</h2>
+
+<p>
+
+</p><p>
+<br></p><div align="CENTER">
+<table cellpadding="3" border="1" align="CENTER">
+<tbody><tr><td align="CENTER"><font size="-1">
+ </font></td>
+<td align="CENTER" colspan="2"><font size="-1"> </font><font size="-1"><b>By % Broken</b></font></td>
+<td align="CENTER" colspan="2"><font size="-1"> </font><font size="-1"><b>By Viol/Kloc</b></font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+
+Rank </font></td>
+<td align="LEFT"><font size="-1"> FS </font></td>
+<td align="RIGHT"><font size="-1"> Frac. </font></td>
+<td align="LEFT"><font size="-1"> FS </font></td>
+<td align="RIGHT"><font size="-1"> Viol/Kloc </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+
+ 1 </font></td>
+<td align="LEFT"><font size="-1"> IBM JFS </font></td>
+<td align="RIGHT"><font size="-1"> 24.4 </font></td>
+<td align="LEFT"><font size="-1"> ext3 </font></td>
+<td align="RIGHT"><font size="-1"> 7.2 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+ 2 </font></td>
+<td align="LEFT"><font size="-1"> ext3 </font></td>
+<td align="RIGHT"><font size="-1"> 22.1 </font></td>
+<td align="LEFT"><font size="-1"> IBM JFS </font></td>
+<td align="RIGHT"><font size="-1"> 5.6 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+ 3 </font></td>
+<td align="LEFT"><font size="-1"> JFFS v2 </font></td>
+<td align="RIGHT"><font size="-1"> 15.7 </font></td>
+<td align="LEFT"><font size="-1"> NFS Client </font></td>
+<td align="RIGHT"><font size="-1"> 3.6 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+ 4 </font></td>
+<td align="LEFT"><font size="-1"> NFS Client </font></td>
+<td align="RIGHT"><font size="-1"> 12.9 </font></td>
+<td align="LEFT"><font size="-1"> VFS </font></td>
+<td align="RIGHT"><font size="-1"> 2.9 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+ 5 </font></td>
+<td align="LEFT"><font size="-1"> CIFS </font></td>
+<td align="RIGHT"><font size="-1"> 12.7 </font></td>
+<td align="LEFT"><font size="-1"> JFFS v2 </font></td>
+<td align="RIGHT"><font size="-1"> 2.2 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+ 6 </font></td>
+<td align="LEFT"><font size="-1"> MemMgmt </font></td>
+<td align="RIGHT"><font size="-1"> 11.4 </font></td>
+<td align="LEFT"><font size="-1"> CIFS </font></td>
+<td align="RIGHT"><font size="-1"> 2.1 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+ 7 </font></td>
+<td align="LEFT"><font size="-1"> ReiserFS </font></td>
+<td align="RIGHT"><font size="-1"> 10.5 </font></td>
+<td align="LEFT"><font size="-1"> MemMgmt </font></td>
+<td align="RIGHT"><font size="-1"> 2.0 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+ 8 </font></td>
+<td align="LEFT"><font size="-1"> VFS </font></td>
+<td align="RIGHT"><font size="-1"> 8.4 </font></td>
+<td align="LEFT"><font size="-1"> ReiserFS </font></td>
+<td align="RIGHT"><font size="-1"> 1.8 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+ 9 </font></td>
+<td align="LEFT"><font size="-1"> NTFS </font></td>
+<td align="RIGHT"><font size="-1"> 8.1 </font></td>
+<td align="LEFT"><font size="-1"> XFS </font></td>
+<td align="RIGHT"><font size="-1"> 1.4 </font></td>
+</tr>
+<tr><td align="CENTER"><font size="-1">
+10 </font></td>
+<td align="LEFT"><font size="-1"> XFS </font></td>
+<td align="RIGHT"><font size="-1"> 6.9 </font></td>
+<td align="LEFT"><font size="-1"> NFS Server </font></td>
+<td align="RIGHT"><font size="-1"> 1.2 </font></td>
+</tr>
+</tbody></table>
+
+</div>
+<br>
+<a name="table-analysis-robust"></a>
+
+<font size="-1"><i>
+Table 3: <b>Least Robust File Systems.</b> The table
+shows the ten least robust file systems using two ranking systems. In
+the first ranking system, file system robustness is ranked based on
+the fraction of broken channels over all error channels (the 5th
+column of Table 2). The second ranking system
+sorts file systems based on the number of broken channels found in
+every Kloc (the 6th column of Table 2).
+</i></font><br>
+
+<br>
+
+<p>
+
+</p><p>
+In our first analysis, we would like to correlate the number of
+mistakes in a subsystem with the complexity of that subsystem. For
+file systems, XFS with 71 Kloc has more mistakes than other, smaller
+file systems. However, this does not necessarily make XFS the
+least robust file system. Table&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#table-analysis-robust">3</a> sorts the
+robustness of each file system based on two rankings. In both
+rankings, we only account for file systems that are at least 10 Kloc in
+size with at least 50 error-related calls, <i>i.e.</i>&nbsp;we only consider
+"complex" file systems.
+
+</p><p>
+A noteworthy observation is that ext3 and IBM JFS are ranked as the
+two least robust file systems. This fact affirms our earlier findings
+on the robustness of ext3 and IBM JFS&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#PrabhakaranEtAl05-SOSP">20</a>].
+In this prior work, we found that ext3 and IBM JFS are inconsistent in
+dealing with different kinds of disk failures. Thus, it might be the
+case that these inconsistent policies correlate with inconsistent
+error propagation.
+
+</p><p>
+Among storage device drivers, it is interesting to compare the
+robustness of the SCSI and IDE subsystems. If we compare SCSI and IDE
+subsystems using the first ranking system, SCSI and IDE are almost
+comparable (21% vs.&nbsp;18%). However, if we compare them based on the
+second ranking system, then the SCSI subsystem is almost four times
+more robust than IDE (0.6 vs.&nbsp;2.1 errors/Kloc). Nevertheless, it seems
+to be the case that SCSI utilizes basic error codes much more than IDE does.
+
+</p><p>
+When the robustness of storage drivers and file systems is compared
+using the first ranking, on average storage drivers are less robust
+compared to file systems (22% vs.&nbsp;17%, as reported in the last rows
+of Table&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#table-result-all">2</a>). On the other hand, in the second
+ranking system, storage drivers are more robust compared to file
+systems (1.1 vs.&nbsp;2.4 mistakes/Kloc). From our point of view, the
+first ranking system is more valid because a subsystem could be
+comprised of submodules that do not necessarily use error codes; what
+is more important is the number of bad calls in the population of all
+error-related calls.
+
+</p><p>
+
+</p><h2><a name="SECTION00052000000000000000"></a>
+<a name="sec-analysis-neglected"></a><br>
+4.2 Neglected Write Errors
+</h2>
+
+<p>
+As mentioned in Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-result-silent">3.2</a>, we have observed that
+error codes propagated in <tt><font size="-1">write</font></tt> or <tt><font size="-1">sync</font></tt> operations are often
+ignored. Thus, we investigate how many write errors are neglected
+compared to read errors. This study is motivated by our findings in
+that section as well as by our earlier findings where we found that at
+least for ext3, read failures are detected, but write errors are often
+ignored&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#PrabhakaranEtAl05-SOSP">20</a>].
+
+</p><p>
+To perform this study, we filter out calls that do not relate to read
+and write operations. Since it is impractical to do that manually, we
+use a simple string comparison to mark calls that are relevant to our
+analysis. That is, we only take a caller&rarr;callee pair
+where the callee contains the string <tt><font size="-1">read</font></tt>, <tt><font size="-1">write</font></tt>,
+<tt><font size="-1">sync</font></tt>, or <tt><font size="-1">wait</font></tt>. We include <tt><font size="-1">wait</font></tt>-type calls because in
+many cases <tt><font size="-1">wait</font></tt>-type callees (<i>e.g.</i>, <tt><font size="-1">filemap_fdatawait</font></tt>)
+represent waiting for one or more I/O operations and could return
+error information on the operation. Thus, in our study,
+<tt><font size="-1">write</font></tt>-, <tt><font size="-1">sync</font></tt>-, and <tt><font size="-1">wait</font></tt>-type calls are categorized as
+write operations.
+
+</p><p>
+<br></p><div align="CENTER">
+<table cellpadding="3" border="1" align="CENTER">
+<tbody><tr><td align="CENTER" colspan="1"><font size="-1">
+ </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ Bad </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ EC </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ </font><font size="-1"><b>Frac.</b></font></td>
+</tr>
+<tr><td align="CENTER" colspan="1"><font size="-1">
+ Callee Type</font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ Calls </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ Calls </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ </font><font size="-1"><b>(%)</b></font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+
+Read<sup>*</sup> </font></td>
+<td align="RIGHT"><font size="-1"> 26 </font></td>
+<td align="RIGHT"><font size="-1"> 603 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>4.3</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Sync </font></td>
+<td align="RIGHT"><font size="-1"> 70 </font></td>
+<td align="RIGHT"><font size="-1"> 236 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>29.7</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Wait </font></td>
+<td align="RIGHT"><font size="-1"> 27 </font></td>
+<td align="RIGHT"><font size="-1"> 70 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>38.6</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Write </font></td>
+<td align="RIGHT"><font size="-1"> 80 </font></td>
+<td align="RIGHT"><font size="-1"> 598 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>13.4</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Sync+Wait+Write </font></td>
+<td align="RIGHT"><font size="-1"> 177 </font></td>
+<td align="RIGHT"><font size="-1"> 904 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>19.6</b> </font></td>
+</tr>
+<tr><td align="CENTER" colspan="1"><font size="-1">
+
+ Specific Callee</font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+
+</font><tt><font size="-1">filemap_fdatawait</font></tt><font size="-1"> </font></td>
+<td align="RIGHT"><font size="-1"> 22 </font></td>
+<td align="RIGHT"><font size="-1"> 29 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>75.9</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+</font><tt><font size="-1">filemap_fdatawrite</font></tt><font size="-1"> </font></td>
+<td align="RIGHT"><font size="-1"> 30 </font></td>
+<td align="RIGHT"><font size="-1"> 47 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>63.8</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+</font><tt><font size="-1">sync_blockdev</font></tt><font size="-1"> </font></td>
+<td align="RIGHT"><font size="-1"> 15 </font></td>
+<td align="RIGHT"><font size="-1"> 21 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>71.4</b> </font></td>
+</tr>
+</tbody></table>
+
+</div>
+<br>
+<a name="table-ignored-writes"></a>
+
+<font size="-1"><i>
+Table 4: <b>Neglected write errors in file system code.</b>
+The table shows that read errors are handled more correctly than
+write errors. The upper table shows the fraction of bad calls over
+four categories of calls: read, sync, wait, and write. The latter three
+can be categorized as write operations. The lower table shows
+neglected write errors for three specific functions. The 26 (*)
+violated read calls are all related to readahead and asynchronous
+read; in other words, all error codes returned in synchronous reads
+are being saved and checked.
+</i></font>
+<br>
+
+<br>
+
+<p>
+
+</p><p>
+The upper half of Table&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#table-ignored-writes">4</a> reports our
+findings. The last column shows how often errors are ignored in the
+file system code. Interestingly, file systems have a tendency to
+correctly handle error codes propagated from <tt><font size="-1">read</font></tt>-type calls, but
+not those from <tt><font size="-1">write</font></tt>-type calls (4.3% vs.&nbsp;19.6%). The 26
+(4.3%) unsaved read error codes are all found in readahead
+operations in the memory management subsystem; it might be acceptable
+to ignore prefetch read errors because such reads can be reissued in
+the future whenever the page is actually read.
+
+</p><p>
+As discussed in Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-result-unsaved">3.1</a>, a function could
+return more than one error code at the same time, and checking only
+one of them suffices. However, if we know that a certain function only
+returns a single error code and yet the caller does not save the
+return value properly, then we would know that such a call is really a
+flaw. To find real flaws in the file system code, we examined three
+important functions that we know only return single error codes:
+<tt><font size="-1">sync_blockdev</font></tt>, <tt><font size="-1">filemap_fdatawrite</font></tt>, and
+<tt><font size="-1">filemap_fdatawait</font></tt>. A file system that does not check the
+returned error codes from these functions would obviously let failures
+go unnoticed in the upper layers.
+
+</p><p>
+The lower half of Table&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#table-ignored-writes">4</a> reports our
+findings. Many error codes returned from the three methods are simply
+not saved (&gt; 63% in all cases). Two conclusions might be drawn from
+this observation. First, this could suggest that higher-level recovery
+code does not exist (since if it exists, it will not be invoked due to
+the broken error channel). Second, it could be the case that errors are
+intentionally neglected. We consider this second possibility in
+greater detail in the next section.
+
+</p><p>
+
+</p><h2><a name="SECTION00053000000000000000"></a>
+<a name="sec-analysis-inconsistent"></a><br>
+4.3 Inconsistent Calls: Corner Case or Majority?
+</h2>
+
+<p>
+</p><p>
+In this section, we consider the nature of <em>inconsistent</em> calls.
+For example, we found that 1 out of 33 calls to
+<tt><font size="-1">ide_setup_pci_device</font></tt> does not save the return value. One would
+probably consider this single call as an inconsistent implementation
+because the majority of the calls to that function save the return
+value. On the other hand, we also found that 53 out of 54 calls to
+<tt><font size="-1">unregister_filesystem</font></tt> do not save the return error codes.
+Assuming that most kernel developers are essentially competent, this
+suggests that it may actually be safe to not check the error code
+returned from this particular function.
+
+</p><p>
+To quantify inconsistent calls, we define the <em>inconsistent call
+frequency</em> of a function as the ratio of bad calls over all
+error-related calls to the function, and correlate this frequency with
+the number of bad calls to the function. For example, the
+inconsistent call frequencies for <tt><font size="-1">ide_setup_pci_device</font></tt> and
+<tt><font size="-1">unregister_filesystem</font></tt> are 3% (1/33) and 98% (53/54)
+respectively and the numbers of bad calls are 1 and 53 respectively.
+
+</p><p>
+Figure&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#fig-analysis-inconsistent">6</a> plots the cumulative
+distribution function of this behavior. The graph could be seen as a
+means to prioritize which bad calls to fix first. Bad calls that fall
+below the 20% mark could be treated as <em>corner cases</em>, <i>i.e.</i>&nbsp;we
+should be suspicious of one bad call in the midst of four good calls
+to the same function. On the other hand, bad calls that fall above the
+80% mark could hint that either different developers make the same
+mistake and ignore it, or it is probably safe to make such a mistake.
+
+</p><p>
+
+</p><div align="CENTER">
+
+<p><a name="fig-analysis-inconsistent"></a></p><div align="CENTER">
+<img src="./Error Handling is Ocassionally Correct_files/fig-analysis-cdf.gif">
+</div>
+ <br>
+<font size="-1"><i>
+Figure 6: <b>Inconsistent calls frequency.</b>
+The figure shows that inconsistent calls are not corner-case bugs.
+The x-axis represents the inconsistent-call frequency of a function.
+x=20% means that there is one bad call out of five total calls;
+x=80% means that there are four bad calls out of five total calls.
+The left y-axis counts the cumulative number of bad calls. For example,
+below the 20% mark, there are 80 bad calls that have an
+inconsistent-call frequency of less than 20%.
+As reported in
+Table 2, there exist a total of 1153 bad calls.
+The right y-axis shows the cumulative fraction of bad calls over
+the 1153 bad calls. </i></font>
+
+<br>
+
+</div>
+
+
+<p>
+One perplexing phenomenon visible in the graph is that around 871 bad
+calls fall above the 50% mark. In other words, they cannot be
+considered as corner-case bugs; the developers might be aware of these
+bad calls, but probably just ignore them. One thing we have learned
+from our recent work on file system code is that if a file system does
+not know how to recover from a failure, it has the tendency to just
+ignore the error code. For example, ext3 ignores write failures during
+checkpointing simply because it has no recovery mechanism (<i>e.g.</i>,
+chained transactions&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#GunawiEtAl07-IOShepherd">12</a>]) to deal with such
+failures. Thus, we suspect that there are deeper design shortcomings
+behind poor error code handling; error code mismanagement may be as
+much symptom as disease.
+
+</p><p>
+Our analysis is similar to the work of Engler&nbsp;<i>et al.</i> on finding bugs
+automatically&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#EnglerEtAl01-Bugs">8</a>]. In their work, they use
+existing implementation to imply beliefs and facts. Applying their
+analysis to our case, the bad calls that fall above the 80% mark
+might be considered as good calls.
+However, since we are analyzing the specific problem of error
+propagation, we use that semantic knowledge and demand a discipline
+that promotes checking an error code in all circumstances, rather than
+one that follows majority rules.
+
+</p><p>
+
+</p><h2><a name="SECTION00054000000000000000"></a>
+<a name="sec-analysis-characteristic"></a><br>
+4.4 Characteristics of Error Channels
+</h2>
+
+<p>
+Finally, we study whether the characteristic of an error channel has
+an impact on the robustness of error code propagation in that channel.
+In particular, we explore two characteristics of error channels: one
+based on the error propagation distance and one based on the location
+distance (inter- vs.&nbsp;intra-file calls).
+
+</p><p>
+With the first characteristic, we would like to find out whether error
+codes are lost near the generation endpoint or somewhere in the middle
+of the propagation chain. We distinguish two calls: direct-error and
+propagate-error calls. In a <em>direct-error call</em>, the callee is an
+error-generation endpoint. In a <em>propagate-error call</em>, the
+callee is not a generation endpoint; rather it is a function that
+propagates an error code from one of the functions that it calls,
+<i>i.e.</i>&nbsp;it is a function in the middle of the propagation chain. Next, we
+define a <em>bad</em> direct-error (or propagate-error) call as a
+direct-error (or propagate-error) call that does not save the returned
+error code.
+
+</p><p>
+Initially, we assumed that the frequency of bad propagate-error calls
+would be higher than that of bad direct-error calls; we assumed error
+codes tend to be dropped in the middle of the chain rather than near
+the generation endpoint. It turns out that the number of bad
+direct-error and propagate-error calls are similar for file system
+code but the other way around for storage driver code. In particular,
+for file systems, the ratio of bad over all direct-error calls is
+10%, and the ratio of bad over all propagate-error calls is 14%. For
+storage drivers, they are 20% and 15% respectively.
+
+</p><p>
+
+</p><p>
+<br></p><div align="CENTER">
+<table cellpadding="3" border="1" align="CENTER">
+<tbody><tr><td align="CENTER" colspan="1"><font size="-1">
+ </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ Bad </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ EC </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ </font><font size="-1"><b>Frac.</b> </font></td>
+</tr>
+<tr><td align="CENTER" colspan="1"><font size="-1">
+ </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ Calls </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ Calls </font></td>
+<td align="CENTER" colspan="1"><font size="-1">
+ </font><font size="-1"><b>(%)</b> </font></td>
+</tr>
+<tr><td align="CENTER" colspan="4"><font size="-1">
+
+ </font><font size="-1"><em>File Systems</em> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+
+Inter-module </font></td>
+<td align="RIGHT"><font size="-1"> 307 </font></td>
+<td align="RIGHT"><font size="-1"> 1944 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>15.8</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Inter-file </font></td>
+<td align="RIGHT"><font size="-1"> 367 </font></td>
+<td align="RIGHT"><font size="-1"> 2786 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>13.2</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Intra-file </font></td>
+<td align="RIGHT"><font size="-1"> 159 </font></td>
+<td align="RIGHT"><font size="-1"> 2548 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>6.2</b> </font></td>
+</tr>
+<tr><td align="CENTER" colspan="4"><font size="-1">
+
+ </font><font size="-1"><em>Storage Drivers</em> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+
+Inter-module </font></td>
+<td align="RIGHT"><font size="-1"> 48 </font></td>
+<td align="RIGHT"><font size="-1"> 199 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>24.1</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Inter-file </font></td>
+<td align="RIGHT"><font size="-1"> 92 </font></td>
+<td align="RIGHT"><font size="-1"> 495 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>18.6</b> </font></td>
+</tr>
+<tr><td align="LEFT"><font size="-1">
+Intra-file </font></td>
+<td align="RIGHT"><font size="-1"> 180 </font></td>
+<td align="RIGHT"><font size="-1"> 1050 </font></td>
+<td align="RIGHT"><font size="-1"> </font><font size="-1"><b>17.1</b> </font></td>
+</tr>
+</tbody></table>
+
+</div>
+<br>
+<a name="table-inter-module"></a>
+
+<font size="-1"><i>
+Table 5: <b>Calls based on location distance.</b> The
+table shows that the fraction of bad calls in inter-module calls is
+higher than the one in inter-file calls. Similarly, inter-file calls
+are less robust than intra-file calls. Note that "inter-file"
+refers to cross-file calls within the same module. Inter-file calls across
+different modules are categorized as inter-module. </i></font>
+
+<br>
+
+<br>
+
+<p>
+
+</p><p>
+Lastly, in the second characteristic, we categorized calls based on
+the location distance between a caller and a callee. In particular, we
+distinguish three calls: inter-module, inter-file (but within the same
+module), and intra-file calls. Table&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#table-inter-module">5</a> reports
+that intra-file calls are more robust than inter-file calls, and
+inter-file calls are more robust than inter-module calls. For example,
+out of 1944 inter-module calls in which error codes propagate in file
+systems, 307 (16%) of them are bad calls. However, out of 2786
+inter-file calls within the same module, there are only 367 (13%) bad
+calls. Intra-file calls only exhibit 6% bad calls. The same pattern
+occurs in storage device drivers. Thus, we conclude that the location
+distance between the caller and the callee plays a role in the
+robustness of the call.
+
+</p><p>
+
+</p><p>
+
+</p><h1><a name="SECTION00060000000000000000"></a>
+<a name="sec-future"></a><br>
+5 Future Work
+</h1>
+
+<p>
+In this section, we discuss some of the issues we previously deferred
+regarding how to build complete and accurate static error propagation
+analysis. In general, we plan to refine our static analysis with the
+intention of uncovering more violations within the file and storage
+system stack.
+
+</p><p>
+
+</p><h2><a name="SECTION00061000000000000000"></a>
+<a name="sec-future-overwritten"></a><br>
+5.1 Overwritten Error Codes
+</h2>
+
+<p>
+In this paper, we examined broken channels that are caused by unsaved
+and unchecked error codes; broken channels can also be caused by <em>overwritten error codes</em>, in which the container that holds the error
+code is overwritten with another value before the previous error is
+checked. For example, the CIFS code below overwrites (line 6) the
+previous error code received from another call (line 4).
+
+</p><p>
+</p><pre> 1 // cifs/transport.c
+ 2 int SendReceive () {
+ 3 int rc;
+ 4 rc = cifs_sign_smb(); // PROPAGATE E.C.
+ 5 ... // No use of 'rc' here
+ 6 rc = smb_send(); // OVERWRITTEN
+ 7 }
+</pre>
+
+<p>
+Currently, EDP detects overwritten error codes, but reports too many
+false positives to be useful. We are in the process of fine-tuning
+EDP so that it provides more accurate output. The biggest problem we
+have encountered is due to the nature of the error hierarchy: in many
+cases, a less critical error code is overwritten with a more critical
+one. For example, in the memory management code below, when first
+encountering a page error, the error code is set to <tt><font size="-1">EIO</font></tt> (line 6).
+Later, the function checks whether the flags of a <tt><font size="-1">map</font></tt> structure
+carry a no-space error code (line 8). If so, the <tt><font size="-1">EIO</font></tt> error code
+is overwritten (line 9) with a new error code <tt><font size="-1">ENOSPC</font></tt>.
+
+</p><p>
+</p><pre> 1 // mm/filemap.c
+ 2 int wait_on_page_writeback_range (pg, map) {
+ 3 int ret = 0;
+ 4 ...
+ 5 if (PageError(pg))
+ 6 ret = -EIO;
+ 7 ...
+ 8 if (test_bit(AS_ENOSPC, &amp;map-&gt;flags))
+ 9 ret = -ENOSPC;
+ 10 if (test_bit (AS_EIO, &amp;map-&gt;flags))
+ 11 ret = -EIO;
+ 12 return ret;
+ 13 }
+</pre>
+
+<p>
+Manually inspecting the results obtained from EDP, we have identified
+five real cases of overwritten error codes: one each in AFS and FAT,
+and three in CIFS. We believe we will find more cases as we fine-tune
+our analysis of overwritten error codes.
+
+</p><p>
+
+</p><h2><a name="SECTION00062000000000000000"></a>
+<a name="sec-future-transform"></a><br>
+5.2 Error Transformation
+</h2>
+
+<p>
+Our current EDP analysis focuses on the basic error codes that are
+stored and propagated mainly in integer containers. However, file and
+storage systems also use other specific error codes stored in complex
+structures that can be mapped to other error codes in new error
+containers; we call this issue <em>error transformation</em>. For
+example, the block layer clears the <tt>uptodate</tt> bit stored in a
+buffer structure to signal I/O failure, while the VFS layer simply
+uses generic error codes such as <tt><font size="-1">EIO</font></tt> and <tt><font size="-1">EROFS</font></tt>. We have observed a
+path where an error container changes five times, involving four
+different types of containers. A complete EDP analysis must recognize
+all transformations. With a more complete analysis, we expect to see
+even more violations.
+
+</p><p>
+
+</p><h2><a name="SECTION00063000000000000000"></a>
+<a name="sec-future-channel"></a><br>
+5.3 Asynchronous Error Channels
+</h2>
+
+<p>
+Finally, we plan to expand our definition of error channels to include
+<em>asynchronous paths</em>. We briefly describe two examples of
+asynchronous paths and their complexities. First, when a lower layer
+interrupts an upper one to notify it of the completion of an I/O, the
+low-level I/O error code is usually stored in a structure located in
+the heap; the receiver of the interrupt should grab the structure and
+check the error it carries, but tracking this propagation through the
+heap is not straightforward. Another example occurs during
+journaling: a journal daemon is woken up somewhere in the <tt><font size="-1">fsync()</font></tt>
+path and propagates a journal error code via a global journal state.
+When we consider asynchronous error channels, we also expect the
+number of violations to increase.
+
+</p><p>
+
+</p><p>
+
+</p><h1><a name="SECTION00070000000000000000"></a>
+<a name="sec-related"></a><br>
+6 Related Work
+</h1>
+
+<p>
+Previous work has used static techniques to understand a variety of
+problems in software systems. For example, Meta-level compilation
+(MC)&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#EnglerEtAl00-SystemRules">7</a>,<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#EnglerEtAl01-Bugs">8</a>] enables a
+programmer to write simple, system-specific compiler extensions to
+automatically check software for rule violations. With their work, one
+can find broken channels by specifying a rule such as "a returned
+variable must be checked."
+Compared to their work, ours presents more information on how errors
+propagate and converts it into graphical output for ease of analysis
+and debugging.
+
+</p><p>
+Another related project is FiSC&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#YangEtAl04-FSErrors">32</a>], which uses
+the model-checking tool CMC&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#MusuvathiEtAl02-CMC">17</a>] to find file
+system errors in the Linux kernel. Every time the file system under
+test transitions to a new state, FiSC runs a series of invariant
+checkers looking for file system errors. If an error is found, one can
+trace back the states and diagnose the sequence of actions that lead
+to the error. One aspect of our work that is similar to FiSC is that
+we unearth silent failures.
+For example, FiSC detects a bug where a system call returns success
+after it calls a resource allocation routine that fails, <i>e.g.</i>&nbsp;due to
+memory failures.
+
+</p><p>
+In recent work, Johansson analyzes run-time error propagation based on
+interface observations&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#JohanssonSuri05-ErrorProfiling">14</a>].
+Specifically, an error is injected at the OS-driver interface by
+changing the value of a data parameter. By observing the
+application-OS interface after the error injection, they reveal
+whether errors occurring in the OS environment (device drivers) will
+propagate through the OS and affect applications. This run-time
+technique is complementary to our work, especially to uncover the
+eventual bad effects of error-broken channels.
+
+</p><p>
+Solving the error propagation problem is also similar to solving the
+problem of unchecked exceptions. Sacramento <i>et al.</i> found too many
+unchecked exceptions, thus doubting programmers' assurances in
+documenting exceptions&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#SacramentoEtAl06-Exception">25</a>].
+Nevertheless, since using exceptions is not a kernel programming
+style, at least at the current state, solutions to the problem of
+unchecked exceptions might not be applicable to kernel code. Only
+recently is there an effort in employing exceptions in OS
+code&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#CabralMarques06-Exception">3</a>].
+
+</p><p>
+Our tool is also similar to
+Jex&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#RobillardMurphy00-RobustJava">24</a>]. While Jex is a static
+analysis tool that determines exception flow information in Java
+programs, our tool determines the error code flow information within
+the Linux kernel.
+
+</p><p>
+To fix the incomplete error propagation problem, developers could
+simply adopt a simple set-check-use
+methodology&nbsp;[<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#BigriggVos02-SetCheckUse">2</a>]. However, it is
+interesting to see that this simple practice has not been applied
+thoroughly in file systems and storage device drivers. As mentioned
+in Section&nbsp;<a href="https://www.usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html#sec-analysis-inconsistent">4.3</a>, we suspect that there are
+deeper design shortcomings behind poor error code handling.
+
+</p><p>
+
+</p><p>
+
+</p><h1><a name="SECTION00080000000000000000"></a>
+<a name="sec-conclude"></a><br>
+7 Conclusion
+</h1>
+
+<p>
+In this paper, we have analyzed the file and storage systems in Linux 2.6 and
+found that error codes are not consistently propagated. We conclude by
+reprinting some developer comments we found near some problematic cases:
+
+</p><p>
+
+</p><p>
+
+</p><blockquote><font size="-1">CIFS -
+<em>"Not much we can do if it fails anyway, ignore rc." </em>
+</font></blockquote>
+<p>
+
+</p><blockquote><font size="-1">CIFS -
+<em>"Should we pass any errors back?" </em>
+</font></blockquote>
+<p>
+
+</p><blockquote><font size="-1">ext3 -
+<em>"Error, skip block and hope for the best." </em>
+</font></blockquote>
+<p>
+
+</p><blockquote><font size="-1">ext3 -
+<em>"There's no way of reporting error returned from
+ext3_mark_inode_dirty() to userspace. So ignore it." </em>
+</font></blockquote>
+<p>
+
+</p><blockquote><font size="-1">IBM JFS -
+<em>"Note: todo: log error handler." </em>
+</font></blockquote>
+<p>
+
+</p><blockquote><font size="-1">ReiserFS -
+<em>"We can't do anything about an error here." </em>
+</font></blockquote>
+<p>
+
+</p><blockquote><font size="-1">XFS -
+<em>"Just ignore errors at this point. There is
+nothing we can do except to try to keep going." </em>
+</font></blockquote>
+<p>
+
+</p><blockquote><font size="-1">SCSI -
+<em>"Retval ignored?" </em>
+</font></blockquote>
+<p>
+
+</p><blockquote><font size="-1">SCSI -
+<em>"Todo: handle failure." </em>
+</font></blockquote>
+<p>
+
+
+</p><p>
+These comments from developers indicate part of the problem: even when the
+developers are aware they are not properly propagating an error, they do not
+know how to implement the correct response. Given static analysis tools to
+identify the source of bugs (such as EDP), developers may still not be able to
+fix all bugs in a straightforward manner.
+
+</p><p>
+Due to these observations, we believe it is thus time to rethink how
+failures are managed in large systems. Preaching that developers
+follow error handling conventions and hoping the resulting systems
+work as desired seems naive at best. New approaches to error
+detection, propagation, and recovery are needed; in the future, we
+plan to explore a range of error architectures, hoping to find methods
+that increase the level of robustness in the storage systems upon
+which we all rely.
+
+</p><p>
+
+</p><p>
+
+</p><h1><a name="SECTION00090000000000000000"><br>
+Acknowledgments</a>
+</h1>
+
+<p>
+We thank the members of the ADSL research group for their insightful
+comments. We would also like to thank Geoff Kuenning (our shepherd)
+and the anonymous reviewers for their excellent feedback and comments,
+many of which have greatly improved this paper.
+The second author wishes to thank the National Council on Science and
+Technology of Mexico
+and the Secretariat of Public Education
+for their financial support.
+
+</p><p>
+This work is supported by the National Science Foundation
+under the following grants:
+CCF-0621487,
+CNS-0509474,
+CCR-0133456,
+as well as by generous donations from Network Appliance and Sun Microsystems.
+
+</p><p>
+Any opinions, findings, and conclusions or recommendations expressed
+in this material are those of the authors and do not necessarily
+reflect the views of NSF or other institutions.
+
+</p><p>
+
+</p><p>
+ <font size="-1">
+ </font>
+</p><h1><a name="SECTION000100000000000000000"><br>
+Bibliography</a>
+</h1><dl compact=""><dd><p></p></dd><dt><a name="Best00-JFS-Local">1</a>
+</dt><dd>
+Steve Best.
+<br>JFS Overview.
+<br>www.ibm.com/developerworks/library/l-jfs.html, 2000.
+
+<p></p></dd><dt><a name="BigriggVos02-SetCheckUse">2</a>
+</dt><dd>
+Michael&nbsp;W. Bigrigg and Jacob&nbsp;J. Vos.
+<br>The Set-Check-Use Methodology for Detecting Error Propagation
+ Failures in I/O Routines.
+<br>In <em>WDB '02</em>, Washington, DC, June 2002.
+
+<p></p></dd><dt><a name="CabralMarques06-Exception">3</a>
+</dt><dd>
+Bruno Cabral and Paulo Marques.
+<br>Making Exception Handling Work.
+<br>In <em>HotDep II</em>, Seattle, Washington, Nov 2006.
+
+<p></p></dd><dt><a name="CandeaEtAl04-Reboot">4</a>
+</dt><dd>
+George Candea, Shinichi Kawamoto, Yuichi Fujiki, Greg Friedman, and Armando
+ Fox.
+<br>Microreboot - A Technique for Cheap Recovery.
+<br>In <em>OSDI '04</em>, pages 31-44, San Francisco, CA, December 2004.
+
+<p></p></dd><dt><a name="CowanEtAl98-Stackguard">5</a>
+</dt><dd>
+Crispin Cowan, Calton Pu, Dave Maier, Heather Hinton, Jonathan Walpole, Peat
+ Bakke, Steve Beattie, Aaron Grier, Perry Wagle, and Qian Zhang.
+<br>StackGuard: Automatic adaptive detection and prevention of
+ buffer-overflow attacks.
+<br>In <em>USENIX '98 Security</em>, San Antonio, TX, January 1998.
+
+<p></p></dd><dt><a name="EllardMegquier05-DISP">6</a>
+</dt><dd>
+Daniel Ellard and James Megquier.
+<br>DISP: Practical, Efficient, Secure, and Fault-Tolerant Distributed
+ Data Storage.
+<br><em>ACM Transactions on Storage (TOS)</em>, 1(1):71-94, Feb 2005.
+
+<p></p></dd><dt><a name="EnglerEtAl00-SystemRules">7</a>
+</dt><dd>
+Dawson Engler, Benjamin Chelf, Andy Chou, and Seth Hallem.
+<br>Checking System Rules Using System-Specific, Programmer-Written
+ Compiler Extensions.
+<br>In <em>OSDI '00</em>, San Diego, CA, October 2000.
+
+<p></p></dd><dt><a name="EnglerEtAl01-Bugs">8</a>
+</dt><dd>
+Dawson Engler, David&nbsp;Yu Chen, Seth Hallem, Andy Chou, and Benjamin Chelf.
+<br>Bugs as Deviant Behavior: A General Approach to Inferring Errors in
+ Systems Code.
+<br>In <em>SOSP '01</em>, pages 57-72, Banff, Canada, October 2001.
+
+<p></p></dd><dt><a name="EnglerDunbar07-UnderConstrained">9</a>
+</dt><dd>
+Dawson&nbsp;R. Engler and Daniel Dunbar.
+<br>Under-constrained execution: making automatic code destruction easy
+ and scalable.
+<br>In <em>ISSTA '07</em>, London, United Kingdom, July 2007.
+
+<p></p></dd><dt><a name="GodefroidEtAl05-DART">10</a>
+</dt><dd>
+Patrice Godefroid, Nils Klarlund, and Koushik Sen.
+<br>DART: Directed Automated Random Testing.
+<br>In <em>PLDI '05</em>, Chicago, IL, June 2005.
+
+<p></p></dd><dt><a name="EdpOutput">11</a>
+</dt><dd>
+Haryadi&nbsp;S. Gunawi.
+<br>EDP Output for All File Systems.
+<br>www.cs.wisc.edu/adsl/Publications/eio-fast08/readme.html.
+
+<p></p></dd><dt><a name="GunawiEtAl07-IOShepherd">12</a>
+</dt><dd>
+Haryadi&nbsp;S. Gunawi, Vijayan Prabhakaran, Swetha Krishnan, Andrea&nbsp;C.
+ Arpaci-Dusseau, and Remzi&nbsp;H. Arpaci-Dusseau.
+<br>Improving File System Reliability with I/O Shepherding.
+<br>In <em>SOSP '07</em>, pages 283-296, Stevenson, Washington, October
+ 2007.
+
+<p></p></dd><dt><a name="Hind01-PointerAnalysis">13</a>
+</dt><dd>
+Michael Hind.
+<br>Pointer Analysis: Haven't We Solved This Problem Yet?
+<br>In <em>PASTE '01</em>, Snowbird, Utah, June 2001.
+
+<p></p></dd><dt><a name="JohanssonSuri05-ErrorProfiling">14</a>
+</dt><dd>
+Andreas Johansson and Neeraj Suri.
+<br>Error Propagation Profiling of Operating Systems.
+<br>In <em>DSN '05</em>, Yokohama, Japan, June 2005.
+
+<p></p></dd><dt><a name="KolaEtAl05-FaultInLDS">15</a>
+</dt><dd>
+George Kola, Tevfik Kosar, and Miron Livny.
+<br>Faults in Large Distributed Systems and What We Can Do About Them.
+<br>In <em>Euro-Par</em>, August 2005.
+
+<p></p></dd><dt><a name="KoopmanDeVale99-POSIX">16</a>
+</dt><dd>
+Philip Koopman and John DeVale.
+<br>Comparing the Robustness of POSIX Operating Systems.
+<br>In <em>FTCS-29</em>, Madison, Wisconsin, June 1999.
+
+<p></p></dd><dt><a name="MusuvathiEtAl02-CMC">17</a>
+</dt><dd>
+Madanlal Musuvathi, David&nbsp;Y.W. Park, Andy Chou, Dawson&nbsp;R. Engler, and David&nbsp;L.
+ Dill.
+<br>CMC: A Pragmatic Approach to Model Checking Real Code.
+<br>In <em>OSDI '02</em>, Boston, MA, December 2002.
+
+<p></p></dd><dt><a name="NeculaEtAl05-CCured">18</a>
+</dt><dd>
+George&nbsp;C. Necula, Jeremy Condit, Matthew Harren, Scott McPeak, and Westley
+ Weimer.
+<br>CCured: Type-Safe Retrofitting of Legacy Software.
+<br><em>ACM Transactions on Programming Languages and Systems</em>, 27(3),
+ May 2005.
+
+<p></p></dd><dt><a name="Necula02-CIL">19</a>
+</dt><dd>
+George&nbsp;C. Necula, Scott McPeak, S.&nbsp;P. Rahul, and Westley Weimer.
+<br>Cil: An infrastructure for C program analysis and transformation.
+<br>In <em>CC '02</em>, pages 213-228, April 2002.
+
+<p></p></dd><dt><a name="PrabhakaranEtAl05-SOSP">20</a>
+</dt><dd>
+Vijayan Prabhakaran, Lakshmi&nbsp;N. Bairavasundaram, Nitin Agrawal, Haryadi&nbsp;S.
+ Gunawi, Andrea&nbsp;C. Arpaci-Dusseau, and Remzi&nbsp;H. Arpaci-Dusseau.
+<br>IRON File Systems.
+<br>In <em>SOSP '05</em>, pages 206-220, Brighton, UK, October 2005.
+
+<p></p></dd><dt><a name="QinEtAl05-Safemem">21</a>
+</dt><dd>
+Feng Qin, Shan Lu, and Yuanyuan Zhou.
+<br>Exploiting ECC-memory for detecting memory leaks and memory
+ corruption during production runs.
+<br>In <em>HPCA-11</em>, San Francisco, California, February 2005.
+
+<p></p></dd><dt><a name="QinEtAl05-Rx">22</a>
+</dt><dd>
+Feng Qin, Joseph Tucek, Jagadeesan Sundaresan, and Yuanyuan Zhou.
+<br>Rx: Treating Bugs As Allergies.
+<br>In <em>SOSP '05</em>, Brighton, UK, October 2005.
+
+<p></p></dd><dt><a name="Reiser04-ReiserFS">23</a>
+</dt><dd>
+Hans Reiser.
+<br>ReiserFS.
+<br>www.namesys.com, 2004.
+
+<p></p></dd><dt><a name="RobillardMurphy00-RobustJava">24</a>
+</dt><dd>
+Martin&nbsp;P. Robillard and Gail&nbsp;C. Murphy.
+<br>Designing Robust Java Programs with Exceptions.
+<br>In <em>FSE '00</em>, San Diego, CA, November 2000.
+
+<p></p></dd><dt><a name="SacramentoEtAl06-Exception">25</a>
+</dt><dd>
+Paulo Sacramento, Bruno Cabral, and Paulo Marques.
+<br>Unchecked Exceptions: Can the Programmer be Trusted to Document
+ Exceptions?
+<br>In <em>IVNET '06</em>, Florianopolis, Brazil, October 2006.
+
+<p></p></dd><dt><a name="SidiroglouEtAl05-STEM">26</a>
+</dt><dd>
+Stelios Sidiroglou, Michael&nbsp;E. Locasto, Stephen&nbsp;W. Boyd, and Angelos&nbsp;D.
+ Keromytis.
+<br>Building a Reactive Immune System for Software Services.
+<br>In <em>USENIX '05</em>, Anaheim, CA, April 2005.
+
+<p></p></dd><dt><a name="Solomon98-NT">27</a>
+</dt><dd>
+David&nbsp;A. Solomon.
+<br><em>Inside Windows NT</em>.
+<br>Microsoft Programming Series. Microsoft Press, 2nd edition, May 1998.
+
+<p></p></dd><dt><a name="SwiftEtAl03-Nooks">28</a>
+</dt><dd>
+Michael&nbsp;M. Swift, Brian&nbsp;N. Bershad, and Henry&nbsp;M. Levy.
+<br>Improving the Reliability of Commodity Operating Systems.
+<br>In <em>SOSP '03</em>, Bolton Landing, NY, October 2003.
+
+<p></p></dd><dt><a name="SwiftEtAl04-MoreNooks">29</a>
+</dt><dd>
+Michael&nbsp;M. Swift, Brian&nbsp;N. Bershad, and Henry&nbsp;M. Levy.
+<br>Recovering device drivers.
+<br>In <em>OSDI '04</em>, pages 1-16, San Francisco, CA, December 2004.
+
+<p></p></dd><dt><a name="ThainLivny02-ErrorScope">30</a>
+</dt><dd>
+Douglas Thain and Miron Livny.
+<br>Error Scope on a Computational Grid: Theory and Practice.
+<br>In <em>HPDC 11</em>, Edinburgh, Scotland, July 2002.
+
+<p></p></dd><dt><a name="Tweedie98-JournalingExt2">31</a>
+</dt><dd>
+Stephen&nbsp;C. Tweedie.
+<br>Journaling the Linux ext2fs File System.
+<br>In <em>The Fourth Annual Linux Expo</em>, Durham, North Carolina, May
+ 1998.
+
+<p></p></dd><dt><a name="YangEtAl04-FSErrors">32</a>
+</dt><dd>
+Junfeng Yang, Paul Twohey, Dawson Engler, and Madanlal Musuvathi.
+<br>Using Model Checking to Find Serious File System Errors.
+<br>In <em>OSDI '04</em>, San Francisco, CA, December 2004.
+</dd></dl>
+
+
+<p>
+<font size="-1"></font>
+</p><p>
+
+</p><p>
+
+<br><br><br>
+<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","licenseKey":"d823139095","applicationID":"509444","transactionName":"YVJVZksCXkEEVhIMWFgYdlFNCl9cSkAVAFlfT2hAXAdZQABWEhZoWFhDbV8MRVwB","queueTime":0,"applicationTime":150,"ttGuid":"","agentToken":"","atts":"TRVWEAMYTU8=","errorBeacon":"bam.nr-data.net","agent":""}</script>
+
+</p></body></html> \ No newline at end of file
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-analysis-cdf.gif b/reference/Error Handling is Ocassionally Correct_files/fig-analysis-cdf.gif
new file mode 100644
index 00000000..d5ae4632
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-analysis-cdf.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-method-edp.gif b/reference/Error Handling is Ocassionally Correct_files/fig-method-edp.gif
new file mode 100644
index 00000000..1e87f539
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-method-edp.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-big-legend.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-big-legend.gif
new file mode 100644
index 00000000..0af0c054
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-big-legend.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-big.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-big.gif
new file mode 100644
index 00000000..26b329b1
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-big.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-small-ext3.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-ext3.gif
new file mode 100644
index 00000000..0056b2a5
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-ext3.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-small-hfsplus.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-hfsplus.gif
new file mode 100644
index 00000000..db590184
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-hfsplus.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-small-jfs.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-jfs.gif
new file mode 100644
index 00000000..b423e830
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-jfs.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-small-nfs.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-nfs.gif
new file mode 100644
index 00000000..d0adf7d5
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-nfs.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-small-reiserfs.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-reiserfs.gif
new file mode 100644
index 00000000..3775d950
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-reiserfs.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-small-xfs.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-xfs.gif
new file mode 100644
index 00000000..ee8dfcdf
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-small-xfs.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/fig-result-zoom.gif b/reference/Error Handling is Ocassionally Correct_files/fig-result-zoom.gif
new file mode 100644
index 00000000..4887e0ec
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/fig-result-zoom.gif
Binary files differ
diff --git a/reference/Error Handling is Ocassionally Correct_files/main.css b/reference/Error Handling is Ocassionally Correct_files/main.css
new file mode 100644
index 00000000..4ea4b224
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/main.css
@@ -0,0 +1,210 @@
+<!DOCTYPE html>
+<!--[if lt IE 7 ]> <html lang="en" dir="ltr"
+ xmlns:og="http://ogp.me/ns#"
+ xmlns:article="http://ogp.me/ns/article#"
+ xmlns:book="http://ogp.me/ns/book#"
+ xmlns:profile="http://ogp.me/ns/profile#"
+ xmlns:video="http://ogp.me/ns/video#"
+ xmlns:product="http://ogp.me/ns/product#" class="no-js ie6"> <![endif]-->
+<!--[if IE 7 ]> <html lang="en" dir="ltr"
+ xmlns:og="http://ogp.me/ns#"
+ xmlns:article="http://ogp.me/ns/article#"
+ xmlns:book="http://ogp.me/ns/book#"
+ xmlns:profile="http://ogp.me/ns/profile#"
+ xmlns:video="http://ogp.me/ns/video#"
+ xmlns:product="http://ogp.me/ns/product#" class="no-js ie7"> <![endif]-->
+<!--[if IE 8 ]> <html lang="en" dir="ltr"
+ xmlns:og="http://ogp.me/ns#"
+ xmlns:article="http://ogp.me/ns/article#"
+ xmlns:book="http://ogp.me/ns/book#"
+ xmlns:profile="http://ogp.me/ns/profile#"
+ xmlns:video="http://ogp.me/ns/video#"
+ xmlns:product="http://ogp.me/ns/product#" class="no-js ie8"> <![endif]-->
+<!--[if (gte IE 9)|!(IE)]><!-->
+
+<html lang="en" dir="ltr"
+ xmlns:og="http://ogp.me/ns#"
+ xmlns:article="http://ogp.me/ns/article#"
+ xmlns:book="http://ogp.me/ns/book#"
+ xmlns:profile="http://ogp.me/ns/profile#"
+ xmlns:video="http://ogp.me/ns/video#"
+ xmlns:product="http://ogp.me/ns/product#" class="no-js">
+<!--<![endif]-->
+
+<head profile="http://www.w3.org/1999/xhtml/vocab">
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<link rel="shortcut icon" href="https://www.usenix.org/sites/default/files/waves_favicon.ico" type="image/vnd.microsoft.icon" />
+<meta name="generator" content="Drupal 7 (http://drupal.org)" />
+<link rel="canonical" href="https://www.usenix.org/" />
+<link rel="shortlink" href="https://www.usenix.org/" />
+<meta property="og:site_name" content="USENIX" />
+<meta property="og:type" content="website" />
+<meta property="og:url" content="https://www.usenix.org/" />
+<meta property="og:title" content="USENIX" />
+<!-- TODO: add the following meta tags to $head via theme_settings -->
+ <!-- Always force latest IE rendering engine (even in intranet) & Chrome Frame
+ Remove this if you use the .htaccess -->
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><script type="text/javascript">window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var o=e[n]={exports:{}};t[n][0].call(o.exports,function(e){var o=t[n][1][e];return r(o||e)},o,o.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<n.length;o++)r(n[o]);return r}({QJf3ax:[function(t,e){function n(){}function r(t){function e(t){return t&&t instanceof n?t:t?a(t,i,o):o()}function c(n,r,o){t&&t(n,r,o);for(var i=e(o),a=f(n),c=a.length,u=0;c>u;u++)a[u].apply(i,r);return i}function u(t,e){p[t]=f(t).concat(e)}function f(t){return p[t]||[]}function s(){return r(c)}var p={};return{on:u,emit:c,create:s,listeners:f,context:e,_events:p}}function o(){return new n}var i="nr@context",a=t("gos");e.exports=r()},{gos:"7eSDFh"}],ee:[function(t,e){e.exports=t("QJf3ax")},{}],3:[function(t,e){function n(t){return function(){r(t,[(new Date).getTime()].concat(i(arguments)))}}var r=t("handle"),o=t(1),i=t(2);"undefined"==typeof window.newrelic&&(newrelic=window.NREUM);var a=["setPageViewName","addPageAction","setCustomAttribute","finished","addToTrace","inlineHit","noticeError"];o(a,function(t,e){window.NREUM[e]=n("api-"+e)}),e.exports=window.NREUM},{1:12,2:13,handle:"D5DuLP"}],gos:[function(t,e){e.exports=t("7eSDFh")},{}],"7eSDFh":[function(t,e){function n(t,e,n){if(r.call(t,e))return t[e];var o=n();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(t,e,{value:o,writable:!0,enumerable:!1}),o}catch(i){}return t[e]=o,o}var r=Object.prototype.hasOwnProperty;e.exports=n},{}],D5DuLP:[function(t,e){function n(t,e,n){return r.listeners(t).length?r.emit(t,e,n):void(r.q&&(r.q[t]||(r.q[t]=[]),r.q[t].push(e)))}var r=t("ee").create();e.exports=n,n.ee=r,r.q={}},{ee:"QJf3ax"}],handle:[function(t,e){e.exports=t("D5DuLP")},{}],XL7HBI:[function(t,e){function n(t){var e=typeof t;return!t||"object"!==e&&"function"!==e?-1:t===window?0:i(t,o,function(){return 
r++})}var r=1,o="nr@id",i=t("gos");e.exports=n},{gos:"7eSDFh"}],id:[function(t,e){e.exports=t("XL7HBI")},{}],G9z0Bl:[function(t,e){function n(){if(!v++){var t=l.info=NREUM.info,e=f.getElementsByTagName("script")[0];if(t&&t.licenseKey&&t.applicationID&&e){c(p,function(e,n){t[e]||(t[e]=n)});var n="https"===s.split(":")[0]||t.sslForHttp;l.proto=n?"https://":"http://",a("mark",["onload",i()]);var r=f.createElement("script");r.src=l.proto+t.agent,e.parentNode.insertBefore(r,e)}}}function r(){"complete"===f.readyState&&o()}function o(){a("mark",["domContent",i()])}function i(){return(new Date).getTime()}var a=t("handle"),c=t(1),u=window,f=u.document;t(2);var s=(""+location).split("?")[0],p={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",agent:"js-agent.newrelic.com/nr-852.min.js"},d=window.XMLHttpRequest&&XMLHttpRequest.prototype&&XMLHttpRequest.prototype.addEventListener&&!/CriOS/.test(navigator.userAgent),l=e.exports={offset:i(),origin:s,features:{},xhrWrappable:d};f.addEventListener?(f.addEventListener("DOMContentLoaded",o,!1),u.addEventListener("load",n,!1)):(f.attachEvent("onreadystatechange",r),u.attachEvent("onload",n)),a("mark",["firstbyte",i()]);var v=0},{1:12,2:3,handle:"D5DuLP"}],loader:[function(t,e){e.exports=t("G9z0Bl")},{}],12:[function(t,e){function n(t,e){var n=[],o="",i=0;for(o in t)r.call(t,o)&&(n[i]=e(o,t[o]),i+=1);return n}var r=Object.prototype.hasOwnProperty;e.exports=n},{}],13:[function(t,e){function n(t,e,n){e||(e=0),"undefined"==typeof n&&(n=t?t.length:0);for(var r=-1,o=n-e||0,i=Array(0>o?0:o);++r<o;)i[r]=t[e+r];return i}e.exports=n},{}]},{},["G9z0Bl"]);</script><!-- Mobile viewport optimized: j.mp/bplateviewport -->
+ <meta name="google-site-verification" content="NWZh1b4m1muqzcGTUY41ERnLVVU1U0nU1knFu0v-Y5g" />
+ <!-- <meta name="viewport" content="width=device-width, initial-scale=1.0"> -->
+
+ <title>Page not found | USENIX</title><link type="text/css" rel="stylesheet" href="https://www.usenix.org/sites/default/files/css/css_xAloA06fO9MfqH5oB7olBlEMNSCK_bXmI744B3VBaGU.css" media="all" />
+<link type="text/css" rel="stylesheet" href="https://www.usenix.org/sites/default/files/css/css_qzuwDX4H89pFw_qLsggVqpH75q_-Dnp-jVEe2X1-gBo.css" media="all" />
+<link type="text/css" rel="stylesheet" href="https://www.usenix.org/sites/default/files/css/css_JOfTSKVRI4TclH2vFChq7MD8Qs8R6fJcf5wE3J2fXt4.css" media="all" />
+<link type="text/css" rel="stylesheet" href="https://www.usenix.org/sites/default/files/css/css_1Cns3L1_TrifKTSgp644ViF6O9TZ1O0lZsCwyMt9kfE.css" media="screen" />
+<link type="text/css" rel="stylesheet" href="https://www.usenix.org/sites/default/files/css/css_47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU.css" media="print" />
+
+<!--[if lte IE 8]>
+<link type="text/css" rel="stylesheet" href="https://www.usenix.org/sites/all/themes/waves/css/ie.css?o3sy5o" media="all" />
+<![endif]-->
+<!-- <link href='http://fonts.googleapis.com/css?family=Open+Sans:400italic,600italic,800italic,600,400,800' rel='stylesheet' type='text/css'> -->
+<!-- <link href='http://fonts.googleapis.com/css?family=Noto+Sans' rel='stylesheet' type='text/css'> -->
+ <link href="./css/ie.css" media="screen, projection" rel="stylesheet" type="text/css">
+ <script type="text/javascript" src="https://www.usenix.org/sites/default/files/js/js_UWQINlriydSoeSiGQxToOUdv493zEa7dpsXC1OtYlZU.js"></script>
+<script type="text/javascript" src="https://www.usenix.org/sites/default/files/js/js_IE1tR0MJwQVLroCSS5Sy4yftAmfwZ4RgT9sBLauhG2o.js"></script>
+<script type="text/javascript" src="https://www.usenix.org/sites/default/files/js/js_E1JP4TspEoKLVi3t3MsXgycF9wMUk6_jE9-daEZvInI.js"></script>
+<script type="text/javascript">
+<!--//--><![CDATA[//><!--
+(function(i,s,o,g,r,a,m){i["GoogleAnalyticsObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,"script","https://www.usenix.org/sites/default/files/googleanalytics/analytics.js?o3sy5o","ga");ga("create", "UA-3633391-4", {"cookieDomain":"auto"});ga("set", "anonymizeIp", true);ga("set", "page", "/404.html?page=" + document.location.pathname + document.location.search + "&from=" + document.referrer);ga("send", "pageview");
+//--><!]]>
+</script>
+<script type="text/javascript" src="https://www.usenix.org/sites/default/files/js/js_RGgtt0xyoog3aEbqA3PACaZnzIbopLWdKtKDjg0Ge54.js"></script>
+<script type="text/javascript" src="https://www.usenix.org/sites/default/files/js/js_ByQqoFmos53DlLDLfNaeXBXbxiumZmAoP08wXIegdFk.js"></script>
+<script type="text/javascript">
+<!--//--><![CDATA[//><!--
+jQuery.extend(Drupal.settings, {"basePath":"\/","pathPrefix":"","ajaxPageState":{"theme":"waves","theme_token":"qoy_Y_xwpw510ookWtQkmOoLXsXaOKbpBCRYZkn11TQ","js":{"misc\/jquery.js":1,"misc\/jquery.once.js":1,"misc\/drupal.js":1,"sites\/all\/modules\/beautytips\/js\/jquery.bt.min.js":1,"sites\/all\/modules\/beautytips\/js\/beautytips.min.js":1,"sites\/all\/themes\/waves\/js\/search.js":1,"sites\/all\/themes\/waves\/js\/modernizr-1.6.min.js":1,"sites\/all\/themes\/waves\/js\/usenix.js":1,"sites\/all\/modules\/views_slideshow\/js\/views_slideshow.js":1,"sites\/all\/modules\/google_analytics\/googleanalytics.js":1,"0":1,"sites\/all\/themes\/waves\/js\/ad-blocks-leaderboard.js":1,"sites\/all\/themes\/rubik\/js\/rubik.js":1},"css":{"modules\/comment\/comment.css":1,"sites\/all\/modules\/date\/date_api\/date.css":1,"sites\/all\/modules\/date\/date_popup\/themes\/datepicker.1.7.css":1,"modules\/field\/theme\/field.css":1,"sites\/all\/modules\/mollom\/mollom.css":1,"modules\/poll\/poll.css":1,"sites\/all\/modules\/views\/css\/views.css":1,"sites\/all\/modules\/ctools\/css\/ctools.css":1,"sites\/all\/modules\/views_slideshow\/views_slideshow.css":1,"sites\/all\/modules\/biblio\/biblio.css":1,"sites\/all\/modules\/views\/css\/views-admin.seven.css":1,"sites\/all\/themes\/tao\/reset.css":1,"sites\/all\/themes\/tao\/base.css":1,"sites\/all\/themes\/waves\/css\/screen.css":1,"sites\/all\/themes\/waves\/system.css":1,"sites\/all\/themes\/waves\/views.css":1,"sites\/all\/themes\/tao\/drupal.css":1,"sites\/all\/themes\/waves\/core.css":1,"sites\/all\/themes\/waves\/icons.css":1,"sites\/all\/themes\/waves\/style.css":1,"sites\/all\/themes\/waves\/print.css":1,"sites\/all\/themes\/waves\/base.css":1,"sites\/all\/themes\/waves\/reset.css":1,"sites\/all\/themes\/waves\/css\/ie.css":1}},"beautytips":{".beautytips":{"fill":"#F4F4F4","strokeStyle":"#666666","spikeLength":20,"spikeGirth":10,"width":350,"overlap":0,"centerPointY":1,"cornerRadius":0,"cssStyles":{"fontFamily":"\u0026quot;Luci
da Grande\u0026quot;,Helvetica,Arial,Verdana,sans-serif","fontSize":"12px","padding":"10px 14px"},"shadow":1,"shadowColor":"rgba(0,0,0,.5)","shadowBlur":8,"shadowOffsetX":4,"shadowOffsetY":4,"cssSelect":".beautytips","list":["fill","strokeStyle","spikeLength","spikeGirth","width","overlap","centerPointY","cornerRadius","cssStyles","shadow","shadowColor","shadowBlur","shadowOffsetX","shadowOffsetY"]}},"jcarousel":{"ajaxPath":"\/jcarousel\/ajax\/views"},"googleanalytics":{"trackOutbound":1,"trackMailto":1,"trackDownload":1,"trackDownloadExtensions":"7z|aac|arc|arj|asf|asx|avi|bin|csv|doc|epub|exe|flv|gif|gz|gzip|hqx|jar|jpe?g|js|mobi|mp(2|3|4|e?g)|mov(ie)?|msi|msp|pdf|phps|png|ppt|qtm?|ra(m|r)?|sea|sit|tar|tgz|torrent|txt|wav|wma|wmv|wpd|xls|xml|z|zip"},"urlIsAjaxTrusted":{"\/legacy\/event\/fast08\/tech\/full_papers\/gunawi\/gunawi_html\/main.css":true}});
+//--><!]]>
+</script>
+<!--[if (gte IE 6)&(lte IE 8)]><script type="text/javascript" src="sites/all/themes/waves/js/mootools-core-1.4.1-full-nocompat-yc.js"></script><script type="text/javascript" src="sites/all/themes/waves/js/selectivizr/selectivizr-min.js"></script><![endif]--><!--[if lt IE 9]><script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]-->
+ <script type="application/ld+json">
+ { "@context" : "http://schema.org",
+ "@type" : "Organization",
+ "name" : "USENIX Association",
+ "url" : "https://www.usenix.org",
+ "sameAs" : [ "https://www.facebook.com/usenixassociation",
+ "http://www.twitter.com/usenix",
+ "https://plus.google.com/108588319090208187909/posts",
+ "http://www.linkedin.com/groups/USENIX-Association-49559/about",
+ "http://www.youtube.com/user/USENIXAssociation"]
+ }
+ </script>
+
+</head>
+
+<body class="html not-front not-logged-in page-usenix-redirects-404 tao no-sidebars" >
+ <div id="skip-link">
+ <a href="#main-content" class="element-invisible element-focusable">Skip to main content</a>
+ </div>
+<div id='top-outer'><div id='top' class='limiter'>
+ <div id='top-inner'>
+ <div class="top-left">
+ <div id="site-name"><a href="/">USENIX</a></div>
+ </div>
+ <div class="top-right">
+ <div id='header-login'>
+ <span class="header-login-first"><a href="/user?destination=legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/main.css">Sign In</a></span><a href="/user/register?destination=legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/main.css">Create Account</a> </div>
+ <div id="search-bar">
+ <form action="/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/main.css" method="post" id="search-block-form" accept-charset="UTF-8"><div><div class="container-inline">
+ <h2 class="element-invisible">Search form</h2>
+ <div class="form-item form-type-textfield form-item-search-block-form">
+ <input title="Enter the terms you wish to search for." type="text" id="edit-search-block-form--2" name="search_block_form" value="" size="15" maxlength="128" class="form-text" />
+</div>
+<div class="form-actions form-wrapper" id="edit-actions"><input type="submit" id="edit-submit" name="op" value="Go" class="form-submit" /></div><input type="hidden" name="form_build_id" value="form-mZ98hnE_3pV2aRqTe8S1Wq6ELFtnMAT1DVSxMEeJxl8" />
+<input type="hidden" name="form_id" value="search_block_form" />
+</div>
+</div></form> </div>
+ </div>
+ <nav id="primary">
+ <div id='branding' class='limiter'>
+ <div id="main-menu" class="navigation">
+ <h2 class="element-invisible">Main menu</h2><ul id="main-menu-links" class="links clearfix"><li class="menu-2685 first"><a href="/" title="">Home</a></li>
+<li class="menu-2627"><a href="/about" title="About USENIX">About</a></li>
+<li class="menu-1121"><a href="/conferences" title="">Conferences</a></li>
+<li class="menu-22927"><a href="/publications" title="Publications from USENIX">Publications</a></li>
+<li class="menu-2737"><a href="/lisa" title="LISA">LISA SIG</a></li>
+<li class="menu-2885"><a href="/membership-services" title="Membership &amp; Services">Membership &amp; Services</a></li>
+<li class="menu-2881"><a href="/students" title="Student Programs">Students</a></li>
+<li class="menu-2844 last"><a href="/store" title="">Store</a></li>
+</ul> </div> <!-- /#main-menu -->
+ </div>
+ <nav id="social-menu">
+ <div id="facebook"><a href="https://www.facebook.com/pages/USENIX-Association/124487434386" target="_blank">Facebook</a></div> <div id="googleplus"><a href="https://plus.google.com/108588319090208187909" target="_blank">Google Plus</a></div> <div id="twitter"><a href="http://twitter.com/usenix" target="_blank">Twitter</a></div> <div id="linkedin"><a href="http://www.linkedin.com/groups?home=&gid=49559" target="_blank">LinkedIn</a></div> <div id="youtube"><a href="http://www.youtube.com/user/USENIXAssociation" target="_blank">YouTube</a></div> </nav>
+ <nav id="donate-button">
+ <div id="donatebutton"><a style="border-bottom: none;" href="https://co.clickandpledge.com/advanced/default.aspx?wid=73860" target="_blank"><img src="/sites/all/themes/waves/css/images/donate-button.png"></a></div>
+ </nav>
+ </nav>
+ </div>
+</div></div>
+
+
+
+<div id="container">
+ <h2 class="element-invisible">You are here</h2><div class="breadcrumb"><a href="/">Home</a> » <strong>Page not found</strong></div>
+ <div id='page' class='clearfix limiter page-content'>
+
+ <div id='content' class=
+ 'no-sidebars'
+ >
+
+ <h1 class='page-title '>
+ Page not found </h1>
+
+
+ <div id='tabs'>
+ </div>
+
+ <div class='content-wrapper clearfix'>
+ <div class="region region-content">
+
+<div class='block-system block-page-content clearfix' id="block-system-main">
+
+
+
+
+ <div class="block-content clearfix"><p>Sorry, the page that you have requested is not available on this server. Perhaps:</p> <ul> <li>The link you are looking for has moved. If you have arrived here from another page, please back up and send mail to the page owner.</li> <li>You are looking for a conference that has happened. See the <a href="/conferences/all">Conferences</a> page.</li> <li>The link you are looking for has been withdrawn. If this has happened, and we are not providing some information that you need, please send mail to <a href="mailto:webster@usenix.org">webster@usenix.org</a>.</li> <p>&nbsp;</p> </ul></div>
+
+ </div>
+
+ </div>
+ </div>
+
+ </div> <!-- /#content -->
+ </div>
+</div>
+
+ <div id="footer-outer"><div id='footer' class='limiter clearfix'>
+
+ <div id='footer-region'>
+ <div class="region region-footer">
+
+<div class='block block-block clearfix' id="block-block-27">
+
+
+
+
+ <div class="block-content clearfix prose"><div style="float: right;">
+<p style="text-align: center;">© USENIX 2016</p>
+</div></div>
+
+ </div>
+
+
+<div class='block block-menu clearfix' id="block-menu-menu-footer">
+
+
+
+
+ <div class="block-content clearfix"><ul class="menu"><li class="first leaf"><a href="/privacy-statement">Privacy Statement</a></li>
+<li class="leaf"><a href="/contact" title="">Contact Us</a></li>
+<li class="last leaf"><a href="/blog/rss.xml" title="">USENIX Update RSS Feed</a></li>
+</ul></div>
+
+ </div>
+
+ </div>
+ </div>
+ </div></div>
+
+
+<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","licenseKey":"d823139095","applicationID":"509444","transactionName":"YVJVZksCXkEEVhIMWFgYdlFNCl9cSkAVAFlfT2hAXAdZQABWEhZoWFhDbV8MRVwB","queueTime":0,"applicationTime":197,"ttGuid":"","agentToken":"","atts":"TRVWEAMYTU8=","errorBeacon":"bam.nr-data.net","agent":""}</script></body>
+</html>
diff --git a/reference/Error Handling is Ocassionally Correct_files/new_usenix.jpg b/reference/Error Handling is Ocassionally Correct_files/new_usenix.jpg
new file mode 100644
index 00000000..5815ff36
--- /dev/null
+++ b/reference/Error Handling is Ocassionally Correct_files/new_usenix.jpg
Binary files differ
diff --git a/reference/Files are hard.html b/reference/Files are hard.html
new file mode 100644
index 00000000..d8a91640
--- /dev/null
+++ b/reference/Files are hard.html
@@ -0,0 +1,484 @@
+<!DOCTYPE html>
+<!-- saved from url=(0035)http://danluu.com/file-consistency/ -->
+<html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+ <meta charset="utf-8">
+ <title>Files are hard</title>
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+<style>
+ img { max-width: 100%; height: auto; }
+ pre { max-width: 100%; height: auto; white-space: pre-wrap }
+ .link-left {
+ float: left;
+ padding-right: 1em;
+ padding-bottom: 1em;
+ }
+ .link-right {
+ float: right;
+ padding-bottom: 1em;
+ }
+ .navi-parent {
+ display: flex
+ }
+ .navi {
+ padding-bottom: 1em;
+ }
+ .navi-right {
+ margin-left: auto;
+ padding-bottom: 1em;
+ }
+ .navi div {
+ display: inline;
+ font-style: italic;
+ padding-left: 1em;
+ }
+ .navi-right div {
+ display: inline;
+ font-style: italic;
+ padding-right: 1em;
+ }
+
+</style>
+ <meta name="description" content="">
+ <meta name="generator" content="Hugo 0.15">
+ </head>
+ <body>
+ <div id="wrapper">
+ <div class="container">
+
+
+ <div id="article">
+ <div class="article-title"><strong>Files are hard</strong></div>
+ <hr>
+ <div class="post">
+
+
+<p>I haven’t used a desktop email client in years. None of them could handle the volume of email I get without at least occasionally corrupting my mailbox. Pine, eudora, and outlook have all corrupted my inbox, forcing me to restore from backup. How is it that desktop mail clients are less reliable than gmail, even though my gmail account not only handles more email than I ever had on desktop clients, but also allows simultaneous access from multiple locations across the globe? Distributed systems have an unfair advantage, in that they can be robust against total disk failure in a way that desktop clients can’t, but none of the file corruption issues I’ve had have been from total disk failure. Why has my experience with desktop applications been so bad?</p>
+
+<p>Well, what sort of failures can occur? Crash consistency (maintaining consistent state even if there’s a crash) is probably the easiest property to consider, since we can assume that everything, from the filesystem to the disk, works correctly; let’s consider that first.</p>
+
+<h3 id="crash-consistency">Crash Consistency</h3>
+
+<p>Pillai et al. had a <a href="https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-pillai.pdf">paper</a> and <a href="https://www.usenix.org/sites/default/files/conference/protected-files/osdi14_slides_pillai.pdf">presentation</a> at OSDI ‘14 on exactly how hard it is to save data without corruption or data loss.</p>
+
+<p>Let’s look at a simple example of what it takes to save data in a way that’s robust against a crash. Say we have a file that contains the text <code>a foo</code> and we want to update the file to contain <code>a bar</code>. The pwrite function looks like it’s designed for this exact thing. It takes a file descriptor, what we want to write, a length, and an offset. So we might try</p>
+
+<pre><code>pwrite([file], “bar”, 3, 2) // write 3 bytes at offset 2
+</code></pre>
+
+<p>What happens? If nothing goes wrong, the file will contain <code>a bar</code>, but if there’s a crash during the write, we could get <code>a boo</code>, <code>a far</code>, or any other combination. Note that you may want to consider this an example over sectors or blocks and not chars/bytes.</p>
+
+<p>If we want atomicity (so we either end up with <code>a foo</code> or <code>a bar</code> but nothing in between) one standard technique is to make a copy of the data we’re about to change in an <a href="http://www.cburch.com/cs/340/reading/log/index.html">undo log</a> file, modify the “real” file, and then delete the log file. If a crash happens, we can recover from the log. We might write something like</p>
+
+<pre><code>creat(/dir/log);
+write(/dir/log, “2,3,foo”, 7);
+pwrite(/dir/orig, “bar”, 3, 2);
+unlink(/dir/log);
+</code></pre>
+
+<p>This should allow recovery from a crash without data corruption via the undo log, at least if we’re using <code>ext3</code> and we made sure to mount our drive with <code>data=journal</code>. But we’re out of luck if, like most people, we’re using the default<sup class="footnote-ref" id="fnref:D"><a rel="footnote" href="http://danluu.com/file-consistency/#fn:D">1</a></sup> – with the default <code>data=ordered</code>, the <code>write</code> and <code>pwrite</code> syscalls can be reordered, causing the write to <code>orig</code> to happen before the write to the log, which defeats the purpose of having a log. We can fix that.</p>
+
+<pre><code>creat(/dir/log);
+write(/dir/log, “2, 3, foo”);
+fsync(/dir/log); // don’t allow write to be reordered past pwrite
+pwrite(/dir/orig, 2, “bar”);
+fsync(/dir/orig);
+unlink(/dir/log);
+</code></pre>
+
+<p>That should force things to occur in the correct order, at least if we’re using ext3 with <code>data=journal</code> or <code>data=ordered</code>. If we’re using <code>data=writeback</code>, a crash during the <code>write</code> or <code>fsync</code> to log can leave <code>log</code> in a state where the filesize has been adjusted for the write of “bar”, but the data hasn’t been written, which means that the log will contain random garbage. This is because with <code>data=writeback</code>, metadata is <a href="https://en.wikipedia.org/wiki/Journaling_file_system">journaled</a>, but data operations aren’t, which means that data operations (like writing data to a file) aren’t ordered with respect to metadata operations (like adjusting the size of a file for a write).</p>
+
+<p>We can fix that by adding a checksum to the log file when creating it. If the contents of <code>log</code> don’t contain a valid checksum, then we’ll know that we ran into the situation described above.</p>
+
+<pre><code>creat(/dir/log);
+write(/dir/log, “2, 3, [checksum], foo”); // add checksum to log file
+fsync(/dir/log);
+pwrite(/dir/orig, 2, “bar”);
+fsync(/dir/orig);
+unlink(/dir/log);
+</code></pre>
+
+<p>That’s safe, at least on current configurations of ext3. But it’s legal for a filesystem to end up in a state where the log is never created unless we issue an fsync to the parent directory.</p>
+
+<pre><code>creat(/dir/log);
+write(/dir/log, “2, 3, [checksum], foo”);
+fsync(/dir/log);
+fsync(/dir); // fsync parent directory of log file
+pwrite(/dir/orig, 2, “bar”);
+fsync(/dir/orig);
+unlink(/dir/log);
+</code></pre>
+
+<p>That should prevent corruption on any Linux filesystem, but if we want to make sure that the file actually contains “bar”, we need another fsync at the end.</p>
+
+<pre><code>creat(/dir/log);
+write(/dir/log, “2, 3, [checksum], foo”);
+fsync(/dir/log);
+fsync(/dir);
+pwrite(/dir/orig, 2, “bar”);
+fsync(/dir/orig);
+unlink(/dir/log);
+fsync(/dir);
+</code></pre>
+
+<p>That results in consistent behavior and guarantees that our operation actually modifies the file after it’s completed, as long as we assume that <code>fsync</code> actually flushes to disk. OS X and some versions of ext3 have an fsync that doesn’t really flush to disk. OS X requires <code>fcntl(F_FULLFSYNC)</code> to flush to disk, and some versions of ext3 only flush to disk if the <a href="https://en.wikipedia.org/wiki/Inode">inode</a> changed (which would only happen at most once a second on writes to the same file, since the inode mtime has one second granularity), as an optimization.</p>
+
+<p>Even if we assume fsync issues a flush command to the disk, some disks ignore flush directives for the same reason fsync is gimped on OS X and some versions of ext3 – to look better in benchmarks. Handling that is beyond the scope of this post, but the <a href="http://www.researchgate.net/profile/Vijay_Chidambaram/publication/220958003_Coerced_Cache_Eviction_and_discreet_mode_journaling_Dealing_with_misbehaving_disks/links/54d0f0190cf29ca811040c8a.pdf">Rajimwale et al. DSN ‘11 paper</a> and related work cover that issue.</p>
+
+<h3 id="filesystem-semantics">Filesystem semantics</h3>
+
+<p>When the authors examined ext2, ext3, ext4, btrfs, and xfs, they found that there are substantial differences in how code has to be written to preserve consistency. They wrote a tool that collects block-level filesystem traces, and used that to determine which properties don’t hold for specific filesystems. The authors are careful to note that they can only determine when properties don’t hold – if they don’t find a violation of a property, that’s not a guarantee that the property holds.</p>
+
+<p><img src="./Files are hard_files/fs_properties.png" alt="Different filesystems have very different properties"></p>
+
+<p>Xs indicate that a property is violated. The atomicity properties are basically what you’d expect, e.g., no X for single sector overwrite means that writing a single sector is atomic. The authors note that the atomicity of single sector overwrite sometimes comes from a property of the disks they’re using, and that running these filesystems on some disks won’t give you single sector atomicity. The ordering properties are also pretty much what you’d expect from their names, e.g., an X in the “Overwrite -&gt; Any op” row means that an overwrite can be reordered with some operation.</p>
+
+<p>After they created a tool to test filesystem properties, they then created a tool to check if any applications rely on any potentially incorrect filesystem properties. Because invariants are application specific, the authors wrote checkers for each application tested.</p>
+
+<p><img src="./Files are hard_files/program_bugs.png" alt="Everything is broken"></p>
+
+<p>The authors find issues with most of the applications tested, including things you’d really hope would work, like LevelDB, HDFS, Zookeeper, and git. In a talk, one of the authors noted that the developers of sqlite have a very deep understanding of these issues, but even that wasn’t enough to prevent all bugs. That speaker also noted that version control systems were particularly bad about this, and that the developers had a pretty lax attitude that made it very easy for the authors to find a lot of issues in their tools.
+The most common class of error was incorrectly assuming ordering between syscalls. The next most common class of error was assuming that syscalls were atomic<sup class="footnote-ref" id="fnref:A"><a rel="footnote" href="http://danluu.com/file-consistency/#fn:A">2</a></sup>. These are fundamentally the same issues people run into when doing multithreaded programming. Correctly reasoning about re-ordering behavior and inserting barriers correctly is hard. But even though shared memory concurrency is considered a hard problem that requires great care, writing to files isn’t treated the same way, even though it’s actually harder in a number of ways.</p>
+
+<p>Something to note here is that while btrfs’s semantics aren’t inherently less reliable than ext3/ext4, many more applications corrupt data on top of btrfs because developers aren’t used to coding against filesystems that allow directory operations to be reordered (ext2 is perhaps the most recent widely used filesystem that allowed that reordering). We’ll probably see a similar level of bug exposure when people start using NVRAM drives that have byte-level atomicity. People almost always just run some tests to see if things work, rather than making sure they’re coding against what’s legal in a POSIX filesystem.</p>
+
+<p>Hardware memory ordering semantics are usually <a href="http://danluu.com/new-cpu-features/#memory-concurrency">well documented</a> in a way that makes it simple to determine precisely which operations can be reordered with which other operations, and which operations are atomic. By contrast, here’s <a href="http://man7.org/linux/man-pages/man5/ext4.5.html">the ext manpage</a> on its three data modes:</p>
+
+<blockquote>
+<p>journal: All data is committed into the journal prior to being written into the main filesystem.
+ordered: This is the default mode. All data is forced directly out to the main file system prior to its metadata being committed to the journal.
+writeback: Data ordering is not preserved – data may be written into the main filesystem after its metadata has been committed to the journal. <strong>This is rumoured to be</strong> the highest-throughput option. It guarantees internal filesystem integrity, however it can allow old data to appear in files after a crash and journal recovery.</p>
+</blockquote>
+
+<p>The manpage literally refers to rumor. This is the level of documentation we have. If we look back at our example where we had to add an <code>fsync</code> between the <code>write(/dir/log, “2, 3, foo”)</code> and <code>pwrite(/dir/orig, 2, “bar”)</code> to prevent reordering, I don’t think the necessity of the <code>fsync</code> is obvious from the description in the manpage. If you look at the hardware memory ordering “manpage” above, it specifically defines the ordering semantics, and it certainly doesn’t rely on rumor.</p>
+
+<p>This isn’t to say that filesystem semantics aren’t documented anywhere. Between <a href="http://lwn.net/">lwn</a> and LKML, it’s possible to get a good picture of how things work. But digging through all of that is hard enough that it’s still quite common <a href="http://austingroupbugs.net/view.php?id=672">for there to be long, uncertain discussions on how things work</a>. A lot of the information out there is wrong, and even when information was right at the time it was posted, it often goes out of date.</p>
+
+<p>When digging through archives, I’ve often seen a post from 2005 cited to back up the claim that OS X <code>fsync</code> is the same as Linux <code>fsync</code>, and that OS X <code>fcntl(F_FULLFSYNC)</code> is even safer than anything available on Linux. Even at the time, I don’t think that was true for the 2.4 kernel, although it was true for the 2.6 kernel. But since 2008 or so Linux 2.6 with ext3 will do a full flush to disk for each fsync (if the disk supports it, and the filesystem hasn’t been specially configured with barriers off).</p>
+
+<p>Another issue is that you often also see exchanges <a href="http://lkml.iu.edu/hypermail/linux/kernel/0908.3/01481.html">like this one</a>:</p>
+
+<p><strong>Dev 1</strong>: Personally, I care about metadata consistency, and ext3 documentation suggests that journal protects its integrity. Except that it does not on broken storage devices, and you stil need to run fsck there.<br>
+<strong>Dev 2</strong>: as the ext3 authors have stated many times over the years, you still need to run fsck periodicly anyway.<br>
+<strong>Dev 1</strong>: Where is that documented?<br>
+<strong>Dev 2</strong>: linux-kernel mailing list archives.<br>
+<strong>Dev 3</strong>: Probably from some 6-8 years ago, in e-mail postings that I made.<br></p>
+
+<p>Where’s this documented? Oh, in some mailing list post 6-8 years ago (which makes it 12-14 years from today). I don’t mean to pick on filesystem devs. The fs devs whose posts I’ve read are quite polite compared to LKML’s reputation; they generously spend a lot of their time responding to basic questions and I’m impressed by how patient the expert fs devs are with askers, but it’s hard for outsiders to troll through a decade and a half of mailing list postings to figure out which ones are still valid and which ones have been obsoleted!</p>
+
+<p>In their OSDI 2014 talk, the authors of the paper we’re discussing noted that when they reported bugs they’d found, developers would often respond “POSIX doesn’t let filesystems do that”, without being able to point to any specific POSIX documentation to support their statement. If you’ve followed Kyle Kingsbury’s Jepsen work, this may sound familiar, except devs respond with “filesystems don’t do that” instead of “networks don’t do that”. I think this is understandable, given how much misinformation is out there. Not being a filesystem dev myself, I’d be a bit surprised if I don’t have at least one bug in this post.</p>
+
+<h3 id="filesystem-correctness">Filesystem correctness</h3>
+
+<p>We’ve already encountered a lot of complexity in saving data correctly, and this only scratches the surface of what’s involved. So far, we’ve assumed that the disk works properly, or at least that the filesystem is able to detect when the disk has an error via <a href="https://en.wikipedia.org/wiki/S.M.A.R.T.">SMART</a> or some other kind of monitoring. I’d always figured that was the case until I started looking into it, but that assumption turns out to be completely wrong.</p>
+
+<p>The <a href="http://research.cs.wisc.edu/wind/Publications/iron-sosp05.pdf">Prabhakaran et al. SOSP 05 paper</a> examined how filesystems respond to disk errors in some detail. They created a fault injection layer that allowed them to inject disk faults and then ran things like <code>chdir</code>, <code>chroot</code>, <code>stat</code>, <code>open</code>, <code>write</code>, etc. to see what would happen.</p>
+
+<p>Between ext3, reiserfs, and NTFS, reiserfs is the best at handling errors and it seems to be the only filesystem where errors were treated as first class citizens during design. It’s mostly consistent about propagating errors to the user on reads, and calling <code>panic</code> on write failures, which triggers a restart and recovery. This general policy allows the filesystem to gracefully handle read failure and avoid data corruption on write failures. However, the authors found a number of inconsistencies and bugs. For example, reiserfs doesn’t correctly handle read errors on indirect blocks and leaks space, and a specific type of write failure doesn’t prevent reiserfs from updating the journal and committing the transaction, which can result in data corruption.</p>
+
+<p>Reiserfs is the good case. The authors found that ext3 ignored write failures in most cases, and rendered the filesystem read-only in most cases for read failures. This seems like pretty much the opposite of the policy you’d want. Ignoring write failures can easily result in data corruption, and remounting the filesystem as read-only is a drastic overreaction if the read error was a transient error (transient errors are common). Additionally, ext3 did the least consistency checking of the three filesystems and was the most likely to not detect an error. In one presentation, one of the authors remarked that the ext3 code had lots of comments like “I really hope a write error doesn’t happen here” in places where errors weren’t handled.</p>
+
+<p>NTFS is somewhere in between. The authors found that it has many consistency checks built in, and is pretty good about propagating errors to the user. However, like ext3, it ignores write failures.</p>
+
+<p>The paper has much more detail on the exact failure modes, but the details are mostly of historical interest as many of the bugs have been fixed.</p>
+
+<p>It would be really great to see an updated version of the paper, and in one presentation someone in the audience asked if there was more up to date information. The presenter replied that they’d be interested in knowing what things look like now, but that it’s hard to do that kind of work in academia because grad students don’t want to repeat work that’s been done before, which is pretty reasonable given the incentives they face. Doing replications is a lot of work, often nearly as much work as the original paper, and replications usually give little to no academic credit. This is one of the many cases where the incentives align very poorly with producing real world impact.</p>
+
+<p>The <a href="http://usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html">Gunawi et al. FAST 08</a> is another paper it would be great to see replicated today. That paper follows up the paper we just looked at, and examines the error handling code in different file systems, using a simple static analysis tool to find cases where errors are being thrown away. Being thrown away is defined very loosely in the paper — code like the following</p>
+
+<pre><code>if (error) {
+ printk(“I have no idea how to handle this error\n”);
+}
+</code></pre>
+
+<p>is considered <em>not</em> throwing away the error. Errors are considered to be ignored if the execution flow of the program doesn’t depend on the error code returned from a function that returns an error code.</p>
+
+<p>With that tool, they find that most filesystems drop a lot of error codes:</p>
+
+<p></p><p></p>
+
+<p><br></p><div align="CENTER"><p></p>
+
+<p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><table cellpadding="3" border="1" align="CENTER">
+
+<tbody><tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1"></font></p></td>
+
+<td align="CENTER" colspan="2"><font size="-1"> </font><font size="-1"><b>By % Broken</b></font></td>
+
+<td align="CENTER" colspan="2"><font size="-1"> </font><font size="-1"><b>By Viol/Kloc</b></font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">Rank </font></p></td>
+
+<td align="LEFT"><font size="-1"> FS </font></td>
+
+<td align="RIGHT"><font size="-1"> Frac. </font></td>
+
+<td align="LEFT" colspan="2"><font size="-1"> FS&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Viol/Kloc</font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">1 </font></p></td>
+
+<td align="LEFT"><font size="-1"> IBM JFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 24.4 </font></td>
+
+<td align="LEFT"><font size="-1"> ext3 </font></td>
+
+<td align="RIGHT"><font size="-1"> 7.2 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">2 </font></p></td>
+
+<td align="LEFT"><font size="-1"> ext3 </font></td>
+
+<td align="RIGHT"><font size="-1"> 22.1 </font></td>
+
+<td align="LEFT"><font size="-1"> IBM JFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 5.6 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">3 </font></p></td>
+
+<td align="LEFT"><font size="-1"> JFFS v2 </font></td>
+
+<td align="RIGHT"><font size="-1"> 15.7 </font></td>
+
+<td align="LEFT"><font size="-1"> NFS Client </font></td>
+
+<td align="RIGHT"><font size="-1"> 3.6 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">4 </font></p></td>
+
+<td align="LEFT"><font size="-1"> NFS Client </font></td>
+
+<td align="RIGHT"><font size="-1"> 12.9 </font></td>
+
+<td align="LEFT"><font size="-1"> VFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 2.9 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">5 </font></p></td>
+
+<td align="LEFT"><font size="-1"> CIFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 12.7 </font></td>
+
+<td align="LEFT"><font size="-1"> JFFS v2 </font></td>
+
+<td align="RIGHT"><font size="-1"> 2.2 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">6 </font></p></td>
+
+<td align="LEFT"><font size="-1"> MemMgmt </font></td>
+
+<td align="RIGHT"><font size="-1"> 11.4 </font></td>
+
+<td align="LEFT"><font size="-1"> CIFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 2.1 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">7 </font></p></td>
+
+<td align="LEFT"><font size="-1"> ReiserFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 10.5 </font></td>
+
+<td align="LEFT"><font size="-1"> MemMgmt </font></td>
+
+<td align="RIGHT"><font size="-1"> 2.0 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">8 </font></p></td>
+
+<td align="LEFT"><font size="-1"> VFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 8.4 </font></td>
+
+<td align="LEFT"><font size="-1"> ReiserFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 1.8 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">9 </font></p></td>
+
+<td align="LEFT"><font size="-1"> NTFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 8.1 </font></td>
+
+<td align="LEFT"><font size="-1"> XFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 1.4 </font></td>
+
+</tr>
+
+<tr><td align="CENTER"><font size="-1"><p></p>
+
+</font><p><font size="-1">10 </font></p></td>
+
+<td align="LEFT"><font size="-1"> XFS </font></td>
+
+<td align="RIGHT"><font size="-1"> 6.9 </font></td>
+
+<td align="LEFT"><font size="-1"> NFS Server </font></td>
+
+<td align="RIGHT"><font size="-1"> 1.2 </font></td>
+
+</tr>
+
+</tbody></table><p></p>
+
+<p></p></div><p></p>
+
+<p><br></p>
+
+<p><a name="table-analysis-robust"></a></p>
+
+<p>Comments they found next to ignored errors include: “Should we pass any errors back?”, “Error, skip block and hope for the best.”, “There’s no way of reporting error returned from ext3_mark_inode_dirty() to userspace. So ignore it.“, “Note: todo: log error handler.“, “We can’t do anything about an error here.”, “Just ignore errors at this point. There is nothing we can do except to try to keep going.”, “Retval ignored?”, and “Todo: handle failure.”</p>
+
+<p>One thing to note is that in a lot of cases, ignoring an error is more of a symptom of an architectural issue than a bug per se (e.g., ext3 ignored write errors during checkpointing because it didn’t have any kind of recovery mechanism). But even so, the authors of the papers found many real bugs.</p>
+
+<h3 id="error-recovery">Error recovery</h3>
+
+<p>Every widely used filesystem has bugs that will cause problems on error conditions, which brings up two questions. Can recovery tools robustly fix errors, and how often do errors occur? How do they handle recovery from those problems? The <a href="http://usenix.org/legacy/events/osdi08/tech/full_papers/gunawi/gunawi_html/index.html">Gunawi et al. OSDI 08 paper</a> looks at that and finds that fsck, a standard utility for checking and repairing file systems, “checks and repairs certain pointers in an incorrect order … the file system can even be unmountable after”.</p>
+
+<p>At this point, we know that it’s quite hard to write files in a way that ensures their robustness even when the underlying filesystem is correct, the underlying filesystem will have bugs, and that attempting to repair corruption to the filesystem may damage it further or destroy it. How often do errors happen?</p>
+
+<h3 id="error-frequency">Error frequency</h3>
+
+<p>The <a href="http://bnrg.eecs.berkeley.edu/~randy/Courses/CS294.F07/11.1.pdf">Bairavasundaram et al. SIGMETRICS ‘07 paper</a> found that, depending on the exact model of disk, between 5% and 20% of disks would have at least one error over a two year period. Interestingly, many of these were isolated errors – 38% of disks with errors had only a single error, and 80% had fewer than 50 errors. <a href="https://www.usenix.org/legacy/events/fast08/tech/full_papers/bairavasundaram/bairavasundaram_html/main.html">A follow-up study</a> looked at corruption and found that silent data corruption that was only detected by checksumming happened on .5% of disks per year, with one extremely bad model showing corruption on 4% of disks in a year.</p>
+
+<p>It’s also worth noting that they found very high locality in error rates between disks on some models of disk. For example, there was one model of disk that had a very high error rate in one specific sector, making many forms of RAID nearly useless for redundancy.</p>
+
+<p>That’s another study it would be nice to see replicated. <a href="https://www.backblaze.com/blog/hard-drive-reliability-q3-2015/">Most studies on disk focus on the failure rate of the entire disk</a>, but if what you’re worried about is data corruption, errors in non-failed disks are more worrying than disk failure, which is easy to detect and mitigate.</p>
+
+<h3 id="conclusion">Conclusion</h3>
+
+<p>Files are hard. <a href="http://danluu.com/butler-lampson-1999/#parallelism">Butler Lampson has remarked</a> that when they came up with threads, locks, and condition variables at PARC, they thought that they were creating a programming model that anyone could use, but that there’s now decades of evidence that they were wrong. We’ve accumulated a lot of evidence that humans are very bad at reasoning about these kinds of problems, which are very similar to the problems you have when writing correct code to interact with current filesystems. Lampson suggests that the best known general purpose solution is to package up all of your parallelism into as small a box as possible and then have a wizard write the code in the box. Translated to filesystems, that’s equivalent to saying that as an application developer, writing to files safely is hard enough that it should be done via some kind of library and/or database, not by directly making syscalls.</p>
+
+<p>Sqlite is quite good in terms of reliability if you want a good default. However, some people find it to be too heavyweight if all they want is a file-based abstraction. What they really want is a sort of polyfill for the file abstraction that works on top of all filesystems without having to understand the differences between different configurations (and even different versions) of each filesystem. Since that doesn’t exist yet, when no existing library is sufficient, you need to checksum your data since you will get silent errors and corruption. The only questions are whether or not you detect the errors and whether or not your record format only destroys a single record when corruption happens, or if it destroys the entire database. As far as I can tell, most desktop email client developers have chosen to go the route of destroying all of your email if corruption happens.</p>
+
+<p>These studies also hammer home the point that <a href="http://danluu.com/everything-is-broken/">conventional testing isn’t sufficient</a>. There were multiple cases where the authors of a paper wrote a relatively simple tool and found a huge number of bugs. You don’t need any deep computer science magic to write the tools. The error propagation checker from the paper that found a ton of bugs in filesystem error handling was 4k LOC. If you read the paper, you’ll see that the authors observed that the tool had a very large number of shortcomings because of its simplicity, but despite those shortcomings, it was able to find a lot of real bugs. I wrote a vaguely similar tool at my last job to enforce some invariants, and it was literally two pages of code. It didn’t even have a real parser (it just went line-by-line through files and did some regexp matching to detect the simple errors that it’s possible to detect with just a state machine and regexes), but it found enough bugs that it paid for itself in development time the first time I ran it.</p>
+
+<p>Almost every software project I’ve seen has a lot of low hanging testing fruit. Really basic <a href="http://danluu.com/testing/">random testing</a>, <a href="http://danluu.com/pl-troll/">static analysis</a>, and <a href="https://aphyr.com/tags/jepsen">fault injection</a> can pay for themselves in terms of dev time pretty much the first time you use them.</p>
+
+<h3 id="appendix">Appendix</h3>
+
+<p>I’ve probably covered less than 20% of the material in the papers I’ve referred to here. Here’s a bit of info about some other neat info you can find in those papers, and others.</p>
+
+<p><a href="https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-pillai.pdf">Pillai et al., OSDI ‘14</a>: this paper goes into much more detail about what’s required for crash consistency than this post does. It also gives a fair amount of detail about how exactly applications fail, including diagrams of traces that indicate what false assumptions are embedded in each trace.</p>
+
+<p><a href="http://static.usenix.org/legacy/events/fast12/tech/full_papers/Chidambaram.pdf">Chidambaram et al., FAST ‘12</a>: the same filesystem primitives are responsible for both consistency and ordering. The authors propose alternative primitives that separate these concerns, allowing better performance while maintaining safety.</p>
+
+<p><a href="https://www.researchgate.net/publication/220958003_Coerced_Cache_Eviction_and_Discreet_Mode_Journaling_Dealing_with_Misbehaving_Disks">Rajimwale et al. DSN ‘11</a>: you probably shouldn’t use disks that ignore flush directives, but in case you do, here’s a protocol that forces those disks to flush using normal filesystem operations. As you might expect, the performance for this is quite bad.</p>
+
+<p><a href="http://research.cs.wisc.edu/wind/Publications/iron-sosp05.pdf">Prabhakaran et al. SOSP ‘05</a>: This has a lot more detail on filesystem responses to error than was covered in this post. The authors also discuss JFS, an IBM filesystem for AIX. Although it was designed for high reliability systems, it isn’t particularly more reliable than the alternatives. Related material is covered further in <a href="http://research.cs.wisc.edu/adsl/Publications/pointer-dsn08.pdf">DSN ‘08</a>, <a href="http://research.cs.wisc.edu/adsl/Publications/trust-storagess06.pdf">StorageSS ‘06</a>, <a href="http://research.cs.wisc.edu/adsl/Publications/vmdep-dsn06.pdf">DSN ‘06</a>, <a href="http://research.cs.wisc.edu/adsl/Publications/parity-fast08.pdf">FAST ‘08</a>, and <a href="http://research.cs.wisc.edu/adsl/Publications/envyfs-usenix09.pdf">USENIX ‘09</a>, among others.</p>
+
+<p><a href="http://usenix.org/legacy/event/fast08/tech/full_papers/gunawi/gunawi_html/index.html">Gunawi et al. FAST ‘08</a> : Again, much more detail than is covered in this post on when errors get dropped, and how they wrote their tools. They also have some call graphs that give you one rough measure of the complexity involved in a filesystem. The XFS call graph is particularly messy, and one of the authors noted in a presentation that an XFS developer said that XFS was fun to work on since they took advantage of every possible optimization opportunity regardless of how messy it made things.</p>
+
+<p><a href="http://bnrg.eecs.berkeley.edu/~randy/Courses/CS294.F07/11.1.pdf">Bairavasundaram et al. SIGMETRICS ‘07</a>: There’s a lot of information on disk error locality and disk error probability over time that isn’t covered in this post. <a href="http://research.cs.wisc.edu/adsl/Publications/corruption-fast08.pdf">A followup paper in FAST08 has more details</a>.</p>
+
+<p><a href="http://usenix.org/legacy/events/osdi08/tech/full_papers/gunawi/gunawi_html/index.html">Gunawi et al. OSDI ‘08</a>: This paper has a lot more detail about when fsck doesn’t work. In a presentation, one of the authors mentioned that fsck is the only program that’s ever insulted him. Apparently, if you have a corrupt pointer that points to a superblock, fsck destroys the superblock (possibly rendering the disk unmountable), tells you something like “you dummy, you must have run fsck on a mounted disk”, and then gives up. In the paper, the authors reimplement basically all of fsck using a declarative model, and find that the declarative version is shorter, easier to understand, and much easier to extend, at the cost of being somewhat slower.</p>
+
+<p>Memory errors are beyond the scope of this post, but <a href="http://danluu.com/why-ecc/">memory corruption</a> can cause disk corruption. This is especially annoying because memory corruption can cause you to take a checksum of bad data and write a bad checksum. It’s also possible to corrupt in memory pointers, which often results in something very bad happening. See the <a href="http://research.cs.wisc.edu/adsl/Publications/zfs-corruption-fast10.pdf">Zhang et al. FAST ‘10 paper</a> for more on how ZFS is affected by that. There’s a meme going around that ZFS is safe against memory corruption because it checksums, but that paper found that critical things held in memory aren’t checksummed, and that memory errors can cause data corruption in real scenarios.</p>
+
+<p>The sqlite devs are serious about both <a href="https://www.sqlite.org/howtocorrupt.html">documentation</a> and <a href="https://www.sqlite.org/testing.html">testing</a>. If I wanted to write a reliable desktop application, I’d start by reading the sqlite docs and then talking to some of the core devs. If I wanted to write a reliable distributed application I’d start by getting a job at Google and then reading the design docs and <a href="http://danluu.com/postmortem-lessons/">postmortems</a> for GFS, Colossus, Spanner, etc. J/k, but not really.</p>
+
+<p>We haven’t looked at formal methods at all, but there have been a variety of attempts to formally verify properties of filesystems, such as <a href="https://sibylfs.github.io/">SibylFS</a>.</p>
+
+<p>This list isn’t intended to be exhaustive. It’s just a list of things I’ve read that I think are interesting.</p>
+
+<p><em>Update: many people have read this post and suggested that, in the first file example, you should use the much simpler protocol of copying the file to be modified to a temp file, modifying the temp file, and then renaming the temp file to overwrite the original file. In fact, that’s probably the most common comment I’ve gotten on this post. If you think this solves the problem, I’m going to ask you to pause for five seconds and consider the problems this might have. First, you still need to fsync in multiple places. Second, you will get very poor performance with large files. People have also suggested using many small files to work around that problem, but that will also give you very poor performance unless you do something fairly exotic. Third, if there’s a hardlink, you’ve now made the problem of crash consistency much more complicated than in the original example. Fourth, you’ll lose file metadata, sometimes in ways that can’t be fixed up after the fact. That problem can, on some filesystems, be worked around with ioctls, but that only sometimes fixes the issue and now you’ve got fs specific code to preserve correctness even in the non-crash case. And that’s just the beginning. The fact that so many people thought that this was a simple solution to the problem demonstrates that this problem is one that people are prone to underestimating, even when they’re explicitly warned that people tend to underestimate this problem!</em></p>
+
+<p><strong>If you liked this, you’ll probably enjoy <a href="http://danluu.com/cpu-bugs/">this post on cpu bugs</a>.</strong></p>
+
+<p><small>
+Thanks to Leah Hanson, Katerina Barone-Adesi, Jamie Brandon, Kamal Marhubi, David Turner, Benjamin Gilbert, Tom Murphy, Chris Ball, Joe Doliner, Alexy Romanov, Mindy Preston, Paul McJones, and Evan Jones for comments/discussion.
+</small></p>
+<div class="footnotes">
+
+<hr>
+
+<ol>
+<li id="fn:D">Turns out <a href="https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Storage_Administration_Guide/ch-ext3.html">some commercially supported distros</a> only support <code>data=ordered</code>. Oh, and when I said <code>data=ordered</code> was the default, that’s only the case if pre-2.6.30. After 2.6.30, there’s a config option, <code>CONFIG_EXT3_DEFAULTS_TO_ORDERED</code>. If that’s not set, the default becomes <code>data=writeback</code>.
+ <a class="footnote-return" href="http://danluu.com/file-consistency/#fnref:D"><sup>[return]</sup></a></li>
+
+<li id="fn:A"><p>Cases where overwrite atomicity is required were documented as known issues, and all such cases assumed single-block atomicity and not multi-block atomicity. By contrast, multiple applications (LevelDB, Mercurial, and HSQLDB) had bad data corruption bugs that came from assuming appends are atomic.</p>
+
+<p>That seems to be an indirect result of a commonly used update protocol, where modifications are logged via appends, and then logged data is written via overwrites. Application developers are careful to check for and handle errors in the actual data, but the errors in the log file are often overlooked.</p>
+
+<p>There are a number of other classes of errors discussed, and I recommend reading the paper for the details if you work on an application that writes files.</p>
+ <a class="footnote-return" href="http://danluu.com/file-consistency/#fnref:A"><sup>[return]</sup></a></li>
+</ol>
+</div>
+
+ </div>
+ </div>
+
+<div class="navi-parent">
+ <div class="navi"><div><a href="http://danluu.com/startup-tradeoffs/">← Big company vs. startup work and pay</a></div></div>
+ <div class="navi-right"><div><a href="http://danluu.com/why-ecc/">Should I buy ECC memory? →</a></div></div>
+ </div>
+ </div>
+ <footer>
+<div class="navi-parent">
+ <div class="navi">
+ <div><a href="http://danluu.com/">Archive</a></div>
+ <div><a href="http://danluu.com/blog/archives/popularity">Popular</a></div>
+ <div><a href="http://danluu.com/about">About (hire me!)</a></div>
+ </div>
+<div class="navi-right">
+ <div><a href="https://twitter.com/danluu">Twitter</a></div>
+ <div><a href="http://danluu.com/atom.xml" rel="subscribe-rss" title="subscribe via RSS">RSS</a></div>
+ </div>
+</div>
+ </footer>
+ <script type="text/javascript" src="http://www.google-analytics.com/ga.js"></script><script type="text/javascript">
+ var _gaq = [['_setAccount', 'UA-43852829-1'], ['_trackPageview']];
+ (function() {
+ var ga = document.createElement('script'); ga.type = 'text/javascript';
+ ga.src = '//www.google-analytics.com/ga.js';
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+ })();
+ </script>
+
+
+
+
+</div></body></html> \ No newline at end of file
diff --git a/reference/Files are hard_files/fs_properties.png b/reference/Files are hard_files/fs_properties.png
new file mode 100644
index 00000000..74230792
--- /dev/null
+++ b/reference/Files are hard_files/fs_properties.png
Binary files differ
diff --git a/reference/Files are hard_files/program_bugs.png b/reference/Files are hard_files/program_bugs.png
new file mode 100644
index 00000000..129a0b8b
--- /dev/null
+++ b/reference/Files are hard_files/program_bugs.png
Binary files differ
diff --git a/reference/Linux KAIO/History of Linux KAIO API.pdf b/reference/Linux KAIO/History of Linux KAIO API.pdf
new file mode 100644
index 00000000..2f725ab4
--- /dev/null
+++ b/reference/Linux KAIO/History of Linux KAIO API.pdf
Binary files differ
diff --git a/reference/Linux KAIO/KAIOUserGuide.htm b/reference/Linux KAIO/KAIOUserGuide.htm
new file mode 100644
index 00000000..9d197f12
--- /dev/null
+++ b/reference/Linux KAIO/KAIOUserGuide.htm
@@ -0,0 +1,733 @@
+<!DOCTYPE html>
+<!-- saved from url=(0050)https://code.google.com/p/kernel/wiki/AIOUserGuide -->
+<html><script>var gapi={plusone:{render:function(){},go:function(){}}};</script><script type="text/javascript" async="" src="https://apis.google.com/js/plusone.js"></script><script>var urchinTracker=function(){},_gaq={push:function(){try {if(arguments[0][0]=='_link')window.location.href=arguments[0][1]}catch(er){}}},_gat={_createTracker:function(){}, _getTracker:function(){return{__noSuchMethod__:function(){},_link:function(o){if(o)location.href=o;},_linkByPost:function(){return true;},_getLinkerUrl:function(o){return o;},_trackEvent:function(){}}}};</script><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+
+ <meta name="ROBOTS" content="NOARCHIVE">
+
+ <link rel="icon" type="image/vnd.microsoft.icon" href="https://ssl.gstatic.com/codesite/ph/images/phosting.ico">
+
+
+ <link rel="canonical" href="http://code.google.com/p/kernel/wiki/AIOUserGuide">
+
+ <script type="text/javascript">
+
+
+
+
+ var codesite_token = "ABZ6GAciBR7wXJANw0lEk1JMtMA3vIzrjA:1408361093458";
+
+
+ var CS_env = {"token": "ABZ6GAciBR7wXJANw0lEk1JMtMA3vIzrjA:1408361093458", "profileUrl": "/u/nialldouglas14/", "projectName": "kernel", "assetHostPath": "https://ssl.gstatic.com/codesite/ph", "domainName": null, "projectHomeUrl": "/p/kernel", "assetVersionPath": "https://ssl.gstatic.com/codesite/ph/13997016681179179006", "loggedInUserEmail": "nialldouglas14@gmail.com", "relativeBaseUrl": ""};
+ var _gaq = _gaq || [];
+ _gaq.push(
+ ['siteTracker._setAccount', 'UA-18071-1'],
+ ['siteTracker._trackPageview']);
+
+ _gaq.push(
+ ['projectTracker._setAccount', 'UA-26096441-1'],
+ ['projectTracker._trackPageview']);
+
+ (function() {
+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+ (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(ga);
+ })();
+
+ </script><script type="text/javascript" async="" src="https://ssl.google-analytics.com/ga.js"></script>
+
+
+ <title>AIOUserGuide -
+ kernel -
+
+ A description of how to use AIO -
+ Google production server Linux kernel development - Google Project Hosting
+ </title>
+ <link type="text/css" rel="stylesheet" href="./AIOUserGuide_files/core.css">
+
+ <link type="text/css" rel="stylesheet" href="./AIOUserGuide_files/ph_detail.css">
+
+
+
+ <link type="application/atom+xml" rel="alternate" href="https://code.google.com/feeds/p/kernel/gitchanges/basic?path=/AIOUserGuide.wiki&repo=wiki">
+
+
+<!--[if IE]>
+ <link type="text/css" rel="stylesheet" href="https://ssl.gstatic.com/codesite/ph/13997016681179179006/css/d_ie.css" >
+<![endif]-->
+ <style type="text/css">
+ .menuIcon.off { background: no-repeat url(https://ssl.gstatic.com/codesite/ph/images/dropdown_sprite.gif) 0 -42px }
+ .menuIcon.on { background: no-repeat url(https://ssl.gstatic.com/codesite/ph/images/dropdown_sprite.gif) 0 -28px }
+ .menuIcon.down { background: no-repeat url(https://ssl.gstatic.com/codesite/ph/images/dropdown_sprite.gif) 0 0; }
+
+
+ #maincol {
+ padding-top: 0;
+ padding-bottom: 0;
+ }
+
+
+ </style>
+<style type="text/css"></style></head>
+<body class="t6">
+<script type="text/javascript">
+ window.___gcfg = {lang: 'en'};
+ (function()
+ {var po = document.createElement("script");
+ po.type = "text/javascript"; po.async = true;po.src = "https://apis.google.com/js/plusone.js";
+ var s = document.getElementsByTagName("script")[0];
+ s.parentNode.insertBefore(po, s);
+ })();
+</script>
+<div class="headbg">
+
+ <div id="gaia">
+
+
+ <span>
+
+
+
+ <a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#" id="multilogin-dropdown" onclick="return false;"><u><b>nialldouglas14@gmail.com</b></u> <small>▼</small></a>
+
+
+ | <a href="https://code.google.com/u/nialldouglas14/" id="projects-dropdown" onclick="return false;"><u>My favorites</u> <small>▼</small></a>
+ | <a href="https://code.google.com/u/nialldouglas14/" onclick="_CS_click(&#39;/gb/ph/profile&#39;);" title="Profile, Updates, and Settings"><u>Profile</u></a>
+ | <a href="https://www.google.com/accounts/Logout?continue=https%3A%2F%2Fcode.google.com%2Fp%2Fkernel%2Fwiki%2FAIOUserGuide" onclick="_CS_click(&#39;/gb/ph/signout&#39;);"><u>Sign out</u></a>
+
+ </span>
+
+ </div>
+
+ <div class="gbh" style="left: 0pt;"></div>
+ <div class="gbh" style="right: 0pt;"></div>
+
+
+ <div style="height: 1px"></div>
+<!--[if lte IE 7]>
+<div style="text-align:center;">
+Your version of Internet Explorer is not supported. Try a browser that
+contributes to open source, such as <a href="http://www.firefox.com">Firefox</a>,
+<a href="http://www.google.com/chrome">Google Chrome</a>, or
+<a href="http://code.google.com/chrome/chromeframe/">Google Chrome Frame</a>.
+</div>
+<![endif]-->
+
+
+
+ <table style="padding:0px; margin: 0px 0px 10px 0px; width:100%" cellpadding="0" cellspacing="0" itemscope="" itemtype="http://schema.org/CreativeWork">
+ <tbody><tr style="height: 58px;">
+
+
+
+ <td id="plogo">
+ <link itemprop="url" href="https://code.google.com/p/kernel">
+ <a href="https://code.google.com/p/kernel/">
+
+ <img src="./AIOUserGuide_files/defaultlogo.png" alt="Logo" itemprop="image">
+
+ </a>
+ </td>
+
+ <td style="padding-left: 0.5em">
+
+ <div id="pname">
+ <a href="https://code.google.com/p/kernel/"><span itemprop="name">kernel</span></a>
+ </div>
+
+ <div id="psum">
+ <a id="project_summary_link" href="https://code.google.com/p/kernel/"><span itemprop="description">Google production server Linux kernel development</span></a>
+
+ </div>
+
+
+ </td>
+ <td style="white-space:nowrap;text-align:right; vertical-align:bottom;">
+
+ <form action="https://code.google.com/hosting/search">
+ <input size="30" name="q" value="" type="text">
+
+ <input type="submit" name="projectsearch" value="Search projects">
+ </form>
+
+ </td></tr>
+ </tbody></table>
+
+</div>
+
+
+<div id="mt" class="gtb">
+ <a href="https://code.google.com/p/kernel/" class="tab ">Project&nbsp;Home</a>
+
+
+
+
+ <a href="https://code.google.com/p/kernel/downloads/list" class="tab ">Downloads</a>
+
+
+
+
+
+ <a href="https://code.google.com/p/kernel/w/list" class="tab active">Wiki</a>
+
+
+
+
+
+
+
+ <a href="https://code.google.com/p/kernel/wiki/Git?tm=4" class="tab ">Source</a>
+
+
+
+
+
+
+
+
+ <div class="gtbc"></div>
+</div>
+<table cellspacing="0" cellpadding="0" width="100%" align="center" border="0" class="st">
+ <tbody><tr>
+
+
+
+ <td class="subt">
+ <div class="issueDetail">
+<div class="isf">
+
+ <span class="inIssueList">
+ <span>Search</span>
+ <form action="https://code.google.com/p/kernel/w/list" method="GET" style="display:inline">
+ <select id="can" name="can">
+ <option disabled="disabled">Search within:</option>
+
+ <option value="1">&nbsp;All wiki pages</option>
+ <option value="3">&nbsp;Featured pages</option>
+ <option value="2" selected="selected">&nbsp;Current pages</option>
+
+
+ <option value="5">&nbsp;My starred pages</option>
+
+ <option value="4">&nbsp;Deprecated pages</option>
+
+ </select>
+ <span>for</span>
+ <span id="qq"><input type="text" size="38" id="searchq" name="q" value="" autocomplete="on"></span>
+
+
+
+ <input type="submit" value="Search">
+ </form>
+ </span>
+
+
+
+
+
+
+
+
+
+</div>
+</div>
+
+ </td>
+
+
+
+
+
+
+ <td align="right" valign="top" class="bevel-right"></td>
+ </tr>
+</tbody></table>
+
+
+<script type="text/javascript">
+ var cancelBubble = false;
+ function _go(url) { document.location = url; }
+</script>
+<div id="maincol">
+
+
+
+
+
+
+
+
+
+ <style type="text/css">
+ .delcom { background: #e8e8e8 }
+ .commentcontent {
+ margin: 2em;
+ padding: 0px 10px;
+ width: 66em;
+ }
+ .artifactcomment {
+ border-top: 3px solid #c3d9ff;
+ }
+ #commentform {
+ border-top: 3px solid #c3d9ff;
+ }
+ </style>
+
+<div id="wikipage">
+<table>
+ <tbody><tr>
+
+
+ <td style="vertical-align:top; padding-left:5px">
+
+ <div id="wikiheader">
+
+ <img width="15" height="15" id="star_img" src="./AIOUserGuide_files/star_off.gif" style="cursor:pointer" onclick="_CS_toggleStar(this,
+ {&#39;scope&#39;: &#39;wiki&#39;,
+ &#39;user&#39;: &#39;_CURRENT_USER&#39;,
+ &#39;item&#39;: &#39;kernel:AIOUserGuide&#39;
+ });">
+
+ <span style="font-size:120%;font-weight:bold">AIOUserGuide</span>
+ &nbsp;
+ <div>
+
+ <i>A description of how to use AIO</i>
+
+
+
+ <div id="wikiauthor" style="float:right">
+ Updated <span title="Mon Apr 21 08:39:31 2014">
+ Apr 21, 2014</span>
+
+ by
+
+ <a class="userlink" href="https://code.google.com/u/111678707898441125339/">dehrenb...@google.com</a>
+
+ </div>
+ </div>
+ </div>
+
+ <div id="wikicontent">
+ <div class="vt" id="wikimaincol">
+ <h1><a name="Introduction"></a>Introduction<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Introduction" class="section_anchor"></a></h1><p>The Asynchronous Input/Output (AIO) interface allows many I/O requests to be submitted in parallel without the overhead of a thread per request. The purpose of this document is to explain how to use the Linux AIO interface, namely the function family <tt>io_setup</tt>, <tt>io_submit</tt>, <tt>io_getevents</tt>, <tt>io_destroy</tt>. Currently, the AIO interface is best for <tt>O_DIRECT</tt> access to a raw block device like a disk, flash drive or storage array. </p><h1><a name="What_is_AIO?"></a>What is AIO?<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#What_is_AIO?" class="section_anchor"></a></h1><p>Input and output functions involve a device, like a disk or flash drive, which works much slower than the CPU. Consequently, the CPU can be doing other things while waiting for an operation on the device to complete. There are multiple ways to handle this: </p><ul><li>In the <strong>synchronous I/O</strong> model, the application issues a request from a thread. The thread blocks until the operation is complete. The operating system creates the illusion that issuing the request to the device and receiving the result was just like any other operation that would proceed just on the CPU, but in reality, it may switch in other threads or processes to make use of the CPU resources and to allow other device requests to be issued to the device in parallel, originating from the same CPU. </li><li>In the <strong>asynchronous I/O (AIO)</strong> model, the application can submit one or many requests from a thread. Submitting a request does not cause the thread to block, and instead the thread can proceed to do other computations and submit further requests to the device while the original request is in flight. 
The application is expected to process completions and organize logical computations itself without depending on threads to organize the use of data. </li></ul><p></p><p>Asynchronous I/O can be considered “lower level” than synchronous I/O because it does not make use of a system-provided concept of threads to organize its computation. However, it is often more efficient to use AIO than synchronous I/O due the nondeterministic overhead of threads. </p><h1><a name="The_Linux_AIO_model"></a>The Linux AIO model<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#The_Linux_AIO_model" class="section_anchor"></a></h1><p>The Linux AIO model is used as follows: </p><ol><li>Open an <a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#I/O_context">I/O context</a> to submit and reap I/O requests from. </li><li>Create one or more <a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Submitting_requests">request objects</a> and set them up to represent the desired operation </li><li>Submit these requests to the I/O context, which will send them down to the device driver to process on the device </li><li><a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Processing_results">Reap completions</a> from the I/O context in the form of event completion objects, </li><li>Return to step 2 as needed. </li></ol><p></p><h1><a name="I/O_context"></a>I/O context<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#I/O_context" class="section_anchor"></a></h1><p><tt>io_context_t</tt> is a pointer-sized opaque datatype that represents an “AIO context”. It can be safely passed around by value. Requests in the form of a struct iocb are submitted to an <tt>io_context_t</tt> and completions are read from the <tt>io_context_t</tt>. Internally, this structure contains a queue of completed requests. The length of the queue forms an upper bound on the number of concurrent requests which may be submitted to the <tt>io_context_t</tt>. 
</p><p>To create a new io_context_t, use the function </p><pre class="prettyprint"><span class="kwd">int</span><span class="pln"> io_setup</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> maxevents</span><span class="pun">,</span><span class="pln"> io_context_t </span><span class="pun">*</span><span class="pln">ctxp</span><span class="pun">);</span></pre><p>Here, ctxp is the output and maxevents is the input. The function creates an io_context_t with an internal queue of length maxevents. To deallocate an io_context_t, use </p><pre class="prettyprint"><span class="kwd">int</span><span class="pln"> io_destroy</span><span class="pun">(</span><span class="pln">io_context_t ctx</span><span class="pun">);</span></pre><p>There is a system-wide maximum number of allocated <tt>io_context_t</tt> objects, set at 65536. </p><p>An <tt>io_context_t</tt> object can be shared between threads, both for submission and completion. No guarantees are provided about ordering of submission and completion with respect to interaction from multiple threads. There may be performance implications from sharing <tt>io_context_t</tt> objects between threads. </p><h1><a name="Submitting_requests"></a>Submitting requests<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Submitting_requests" class="section_anchor"></a></h1><p><tt>struct iocb</tt> represents a single request for a read or write operation. The following struct shows a simplification on the struct definition; a full definition is found in <tt>&lt;libaio.h&gt;</tt> within the libaio source code. 
</p><pre class="prettyprint"><span class="kwd">struct</span><span class="pln"> iocb </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="pun">*</span><span class="pln">data</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">short</span><span class="pln"> aio_lio_opcode</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">int</span><span class="pln"> aio_fildes</span><span class="pun">;</span><span class="pln"><br><br>&nbsp; &nbsp; </span><span class="kwd">union</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="pun">*</span><span class="pln">buf</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; </span><span class="kwd">unsigned</span><span class="pln"> </span><span class="kwd">long</span><span class="pln"> nbytes</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; </span><span class="kwd">long</span><span class="pln"> </span><span class="kwd">long</span><span class="pln"> offset</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"> c</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"> u</span><span class="pun">;</span><span class="pln"><br></span><span class="pun">};</span></pre><p>The meaning of the fields is as follows: data is a pointer to a user-defined object used to represent the operation </p><ul><li><tt>aio_lio_opcode</tt> is a flag indicate whether the operation is a read 
(<tt>IO_CMD_PREAD</tt>) or a write (<tt>IO_CMD_PWRITE</tt>) or one of the other supported operations </li><li><tt>aio_fildes</tt> is the fd of the file that the iocb reads or writes </li><li><tt>buf</tt> is the pointer to memory that is read or written </li><li><tt>nbytes</tt> is the length of the request </li><li><tt>offset</tt> is the initial offset of the read or write within the file </li></ul>The convenience functions <tt>io_prep_pread</tt> and <tt>io_prep_pwrite</tt> can be used to initialize a <tt>struct iocb</tt>. <p></p><p>New operations are sent to the device with <tt>io_submit</tt>. </p><pre class="prettyprint"><span class="kwd">int</span><span class="pln"> io_submit</span><span class="pun">(</span><span class="pln">io_context_t ctx</span><span class="pun">,</span><span class="pln"> </span><span class="kwd">long</span><span class="pln"> nr</span><span class="pun">,</span><span class="pln"> </span><span class="kwd">struct</span><span class="pln"> iocb </span><span class="pun">*</span><span class="pln">ios</span><span class="pun">[]);</span></pre><p><tt>io_submit</tt> allows an array of pointers to <tt>struct iocb</tt>s to be submitted all at once. In this function call, <tt>nr</tt> is the length of the <tt>ios</tt> array. If multiple operations are sent in one array, then no ordering guarantees are given between the <tt>iocb</tt>s. Submitting in larger batches sometimes results in a performance improvement due to a reduction in CPU usage. A performance improvement also sometimes results from keeping many I/Os ‘in flight’ simultaneously. </p><p>If the submission includes too many iocbs such that the internal queue of the <tt>io_context_t</tt> would overfill on completion, then <tt>io_submit</tt> will return a non-zero number and set <tt>errno</tt> to <tt>EAGAIN</tt>. </p><p>When used under the right conditions, <tt>io_submit</tt> should not block. However, when used in certain ways, it may block, undermining the purpose of asynchronous I/O. 
If this is a problem for your application, be sure to use the <tt>O_DIRECT</tt> flag when opening a file, and operate on a raw block device. Work is ongoing to fix the problem. </p><h1><a name="Processing_results"></a>Processing results<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Processing_results" class="section_anchor"></a></h1><p>Completions read from an <tt>io_context_t</tt> are of the type <tt>struct io_event</tt>, which contains the following relevant fields. </p><pre class="prettyprint"><span class="kwd">struct</span><span class="pln"> io_event </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="pun">*</span><span class="pln">data</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> iocb </span><span class="pun">*</span><span class="pln">obj</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">long</span><span class="pln"> </span><span class="kwd">long</span><span class="pln"> res</span><span class="pun">;</span><span class="pln"><br></span><span class="pun">};</span></pre><p>Here, data is the same data pointer that was passed in with the <tt>struct iocb</tt>, and obj is the original <tt>struct iocb</tt>. <tt>res</tt> is the return value of the read or write. </p><p>Completions are reaped with <tt>io_getevents</tt>. 
</p><pre class="prettyprint"><span class="kwd">int</span><span class="pln"> io_getevents</span><span class="pun">(</span><span class="pln">io_context_t ctx_id</span><span class="pun">,</span><span class="pln"> </span><span class="kwd">long</span><span class="pln"> min_nr</span><span class="pun">,</span><span class="pln"> </span><span class="kwd">long</span><span class="pln"> nr</span><span class="pun">,</span><span class="pln"> </span><span class="kwd">struct</span><span class="pln"> io_event </span><span class="pun">*</span><span class="pln">events</span><span class="pun">,</span><span class="pln"> </span><span class="kwd">struct</span><span class="pln"> timespec </span><span class="pun">*</span><span class="pln">timeout</span><span class="pun">);</span></pre><p>This function has a good number of parameters, so an explanation is in order: </p><ul><li><tt>ctx_id</tt> is the io_context_t that is being reaped from. </li><li><tt>min_nr</tt> is the minimum number of io_events to return. io_gevents will block until there are min_nr completions to report, if this is not already the case when the function call is made. </li><li><tt>nr</tt> is the maximum number of completions to return. It is expected to be the length of the events array. </li><li><tt>events</tt> is an array of <tt>io_event</tt>s into which the information about completions is written. </li><li><tt>timeout</tt> is the maximum time that a call to <tt>io_getevents</tt> may block until it will return. If <tt>NULL</tt> is passed, then <tt>io_getevents</tt> will block until <tt>min_nr</tt> completions are available. </li></ul><p></p><p>The return value represents how many completions were reported, ie how much of events was written. The return value will be between <tt>0</tt> and <tt>nr</tt>. The return value may be lower than <tt>min_nr</tt> if the timeout expires; if the timeout is <tt>NULL</tt>, then the return value will be between <tt>min_nr</tt> and <tt>nr</tt>. 
</p><p>The parameters give a broad range of flexibility in how AIO can be used. </p><ul><li><tt>min_nr</tt> = 0 (or, equivalently, timeout = 0). This option forms a non-blocking polling technique: it will always return immediately, regardless of whether any completions are available. It makes sense to use min_nr = 0 when calling io_getevents as part of a main run-loop of an application, on each iteration. </li><li><tt>min_nr</tt> = 1. This option blocks until a single completion is available. This parameter is the minimum value which will produce a blocking call, and therefore may be the best value for low latency operations for some users. When an application notices that an <tt>eventfd</tt> corresponding to an <tt>iocb</tt> is triggered (see the next section about <tt>epoll</tt>), then the application can call <tt>io_getevents</tt> on the corresponding io_context_t with a guarantee that no blocking will occur. </li><li><tt>min_nr</tt> &gt; 1. This option waits for multiple completions to return, unless the timeout expires. Waiting for multiple completions may improve throughput due to reduced CPU usage, both due to fewer <tt>io_getevents</tt> calls and because if there is more space in the completion queue due to the removed completions, then a later <tt>io_submit</tt> call may have a larger granularity, as well as a reduced number of context switches back to the calling thread when the event is available. This option runs the risk of increasing the latency of operations, especially when the operation rate is lower. </li></ul><p></p><p>Even if <tt>min_nr</tt> = 0 or 1, it is useful to make nr a bit bigger for performance reasons: more than one event may be already complete, and it could be processed without multiple calls to <tt>io_getevents</tt>. The only cost of a larger <tt>nr</tt> value is that the user must allocate a larger array of events and be prepared to accept them. 
</p><h1><a name="Use_with_epoll"></a>Use with <tt>epoll</tt><a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Use_with_epoll" class="section_anchor"></a></h1><p>Any iocb can be set to notify an eventfd on completion using the libaio function <tt>io_set_eventfd</tt>. The eventfd can be put in an epoll object. When the eventfd is triggered, then the <tt>io_getevents</tt> function can be called on the corresponding <tt>io_context_t</tt>. </p><p>There is no way to use this API to trigger an eventfd only when multiple operations are complete--the <tt>eventfd</tt> will always be triggered on the first operation. Consequently, as described in the previous section, it will often make sense to use <tt>min_nr</tt> = 1 when using <tt>io_getevents</tt> after an <tt>epoll_wait</tt> call that indicates an eventfd involved in AIO. </p><h1><a name="Performance_considerations"></a>Performance considerations<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Performance_considerations" class="section_anchor"></a></h1><ul><li><strong>Blocking during io_submit on ext4, on buffered operations, network access, pipes, etc.</strong> Some operations are not well-represented by the AIO interface. With completely unsupported operations like buffered reads, operations on a socket or pipes, the entire operation will be performed during the io_submit syscall, with the completion available immediately for access with io_getevents. AIO access to a file on a filesystem like ext4 is partially supported: if a metadata read is required to look up the data block (ie if the metadata is not already in memory), then the io_submit call will block on the metadata read. Certain types of file-enlarging writes are completely unsupported and block for the entire duration of the operation. </li><li><strong>CPU overhead</strong>. When performing small operations on a high-performance device and targeting a very high operation rate from a single CPU, a CPU bottleneck may result. 
This can be resolved by submitting and reaping AIO from multiple threads. </li><li><strong>Lock contention when many CPUs or requests share an io_context_t</strong>. There are several circumstances when the kernel datastructure corresponding to an io_context_t may be accessed from multiple CPUs. For example, multiple threads may submit and get events from the same io_context_t. Some devices may use a single interrupt line for all completions. This can cause the lock to be bounced around between cores or the lock to be heavily contended, resulting in higher CPU usage and potentially lower throughput. One solution is to shard into multiple io_context_t objects, for example by thread and a hash of the address. </li><li><strong>Ensuring sufficient parallelism</strong>. Some devices require many concurrent operations to reach peak performance. This means making sure that there are several operations ‘in flight’ simultaneously. On some high-performance storage devices, when operations are small, tens or hundreds must be submitted in parallel in order to achieve maximum throughput. For disk drives, performance may improve with greater parallelism if the elevator scheduler can make better decisions with more operations simultaneously in flight, but the effect is expected to be small in many situations. </li></ul><h1><a name="Alternatives_to_Linux_AIO"></a>Alternatives to Linux AIO<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Alternatives_to_Linux_AIO" class="section_anchor"></a></h1><ul><li><strong>Thread pool of synchronous I/O threads</strong>. This can work for many use cases, and it may be easier to program with. Unlike with AIO, all functions can be parallelized via a thread pool. Some users find that a thread pool does not work well due to the overhead of threads in terms of CPU and memory bandwidth usage from context switching. This comes up as an especially big problem with small random reads on high-performance storage devices. 
</li><li><strong>POSIX AIO</strong>. Another asynchronous I/O interface is POSIX AIO. It is implemented as part of glibc. However, the glibc implementation uses a thread pool internally. For cases where this is acceptable, it might be better to use your own thread pool instead. Joel Becker implemented <a href="http://oss.oracle.com/projects/libaio-oracle/files/" rel="nofollow">a version</a> of POSIX AIO based on the Linux AIO mechanism described above. IBM DeveloperWorks has <a href="http://www.ibm.com/developerworks/linux/library/l-async/index.html" rel="nofollow">a good introduction</a> to POSIX AIO. </li><li><strong>epoll</strong>. Linux has limited support for using epoll as a mechanism for asynchronous I/O. For reads to a file opened in buffered mode (that is, without <tt>O_DIRECT</tt>), if the file is opened as <tt>O_NONBLOCK</tt>, then a read will return <tt>EAGAIN</tt> until the relevant part is in memory. Writes to a buffered file are usually immediate, as they are written out with another writeback thread. However, these mechanisms don’t give the level of control over I/O that direct I/O gives. </li></ul><h1><a name="Sample_code"></a>Sample code<a href="https://code.google.com/p/kernel/wiki/AIOUserGuide#Sample_code" class="section_anchor"></a></h1><p>Below is some example code which uses Linux AIO. I wrote it at Google, so it uses the <a href="http://code.google.com/p/google-glog/" rel="nofollow">Google glog logging library</a> and the <a href="http://code.google.com/p/gflags/?redir=1" rel="nofollow">Google gflags command-line flags library</a>, as well as a loose interpretation of <a href="http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml" rel="nofollow">Google’s C++ coding conventions</a>. When compiling it with gcc, pass <tt>-laio</tt> to dynamically link with libaio. (It isn’t included in glibc, so it must be explicitly included.) 
</p><pre class="prettyprint"><span class="com">// Code written by Daniel Ehrenberg, released into the public domain</span><span class="pln"><br><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;fcntl.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;gflags/gflags.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;glog/logging.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;libaio.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;stdlib.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;stdio.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;sys/stat.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;sys/types.h&gt;</span><span class="pln"><br><br>DEFINE_string</span><span class="pun">(</span><span class="pln">path</span><span class="pun">,</span><span class="pln"> </span><span class="str">"/tmp/testfile"</span><span class="pun">,</span><span class="pln"> </span><span class="str">"Path to the file to manipulate"</span><span class="pun">);</span><span class="pln"><br>DEFINE_int32</span><span class="pun">(</span><span class="pln">file_size</span><span class="pun">,</span><span class="pln"> </span><span class="lit">1000</span><span class="pun">,</span><span class="pln"> </span><span class="str">"Length of file in 4k blocks"</span><span class="pun">);</span><span class="pln"><br>DEFINE_int32</span><span class="pun">(</span><span class="pln">concurrent_requests</span><span class="pun">,</span><span class="pln"> </span><span 
class="lit">100</span><span class="pun">,</span><span class="pln"> </span><span class="str">"Number of concurrent requests"</span><span class="pun">);</span><span class="pln"><br>DEFINE_int32</span><span class="pun">(</span><span class="pln">min_nr</span><span class="pun">,</span><span class="pln"> </span><span class="lit">1</span><span class="pun">,</span><span class="pln"> </span><span class="str">"min_nr"</span><span class="pun">);</span><span class="pln"><br>DEFINE_int32</span><span class="pun">(</span><span class="pln">max_nr</span><span class="pun">,</span><span class="pln"> </span><span class="lit">1</span><span class="pun">,</span><span class="pln"> </span><span class="str">"max_nr"</span><span class="pun">);</span><span class="pln"><br><br></span><span class="com">// The size of operation that will occur on the device</span><span class="pln"><br></span><span class="kwd">static</span><span class="pln"> </span><span class="kwd">const</span><span class="pln"> </span><span class="kwd">int</span><span class="pln"> kPageSize </span><span class="pun">=</span><span class="pln"> </span><span class="lit">4096</span><span class="pun">;</span><span class="pln"><br><br></span><span class="kwd">class</span><span class="pln"> </span><span class="typ">AIORequest</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp;</span><span class="kwd">public</span><span class="pun">:</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pun">*</span><span class="pln"> buffer_</span><span class="pun">;</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">virtual</span><span class="pln"> </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">Complete</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> res</span><span class="pun">)</span><span class="pln"> </span><span class="pun">=</span><span class="pln"> </span><span class="lit">0</span><span 
class="pun">;</span><span class="pln"><br><br>&nbsp; </span><span class="typ">AIORequest</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">int</span><span class="pln"> ret </span><span class="pun">=</span><span class="pln"> posix_memalign</span><span class="pun">(</span><span class="kwd">reinterpret_cast</span><span class="pun">&lt;</span><span class="kwd">void</span><span class="pun">**&gt;(&amp;</span><span class="pln">buffer_</span><span class="pun">),</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;kPageSize</span><span class="pun">,</span><span class="pln"> kPageSize</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; CHECK_EQ</span><span class="pun">(</span><span class="pln">ret</span><span class="pun">,</span><span class="pln"> </span><span class="lit">0</span><span class="pun">);</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">virtual</span><span class="pln"> </span><span class="pun">~</span><span class="typ">AIORequest</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; free</span><span class="pun">(</span><span class="pln">buffer_</span><span class="pun">);</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br></span><span class="pun">};</span><span class="pln"><br><br></span><span class="kwd">class</span><span class="pln"> </span><span class="typ">Adder</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp;</span><span class="kwd">public</span><span class="pun">:</span><span class="pln"><br>&nbsp; </span><span class="kwd">virtual</span><span class="pln"> </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">Add</span><span 
class="pun">(</span><span class="kwd">int</span><span class="pln"> amount</span><span class="pun">)</span><span class="pln"> </span><span class="pun">=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">virtual</span><span class="pln"> </span><span class="pun">~</span><span class="typ">Adder</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"> </span><span class="pun">};</span><span class="pln"><br></span><span class="pun">};</span><span class="pln"><br><br></span><span class="kwd">class</span><span class="pln"> </span><span class="typ">AIOReadRequest</span><span class="pln"> </span><span class="pun">:</span><span class="pln"> </span><span class="kwd">public</span><span class="pln"> </span><span class="typ">AIORequest</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp;</span><span class="kwd">private</span><span class="pun">:</span><span class="pln"><br>&nbsp; </span><span class="typ">Adder</span><span class="pun">*</span><span class="pln"> adder_</span><span class="pun">;</span><span class="pln"><br><br>&nbsp;</span><span class="kwd">public</span><span class="pun">:</span><span class="pln"><br>&nbsp; </span><span class="typ">AIOReadRequest</span><span class="pun">(</span><span class="typ">Adder</span><span class="pun">*</span><span class="pln"> adder</span><span class="pun">)</span><span class="pln"> </span><span class="pun">:</span><span class="pln"> </span><span class="typ">AIORequest</span><span class="pun">(),</span><span class="pln"> adder_</span><span class="pun">(</span><span class="pln">adder</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"> </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">virtual</span><span class="pln"> </span><span class="kwd">void</span><span class="pln"> </span><span 
class="typ">Complete</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> res</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; CHECK_EQ</span><span class="pun">(</span><span class="pln">res</span><span class="pun">,</span><span class="pln"> kPageSize</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Read incomplete or error "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> res</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">int</span><span class="pln"> value </span><span class="pun">=</span><span class="pln"> buffer_</span><span class="pun">[</span><span class="lit">0</span><span class="pun">];</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Read of "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> value </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">" completed"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; adder_</span><span class="pun">-&gt;</span><span class="typ">Add</span><span class="pun">(</span><span class="pln">value</span><span class="pun">);</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br></span><span class="pun">};</span><span class="pln"><br><br></span><span class="kwd">class</span><span class="pln"> </span><span class="typ">AIOWriteRequest</span><span class="pln"> </span><span class="pun">:</span><span class="pln"> </span><span class="kwd">public</span><span class="pln"> </span><span class="typ">AIORequest</span><span class="pln"> </span><span class="pun">{</span><span 
class="pln"><br>&nbsp;</span><span class="kwd">private</span><span class="pun">:</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> value_</span><span class="pun">;</span><span class="pln"><br><br>&nbsp;</span><span class="kwd">public</span><span class="pun">:</span><span class="pln"><br>&nbsp; </span><span class="typ">AIOWriteRequest</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> value</span><span class="pun">)</span><span class="pln"> </span><span class="pun">:</span><span class="pln"> </span><span class="typ">AIORequest</span><span class="pun">(),</span><span class="pln"> value_</span><span class="pun">(</span><span class="pln">value</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; buffer_</span><span class="pun">[</span><span class="lit">0</span><span class="pun">]</span><span class="pln"> </span><span class="pun">=</span><span class="pln"> value</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">virtual</span><span class="pln"> </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">Complete</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> res</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; CHECK_EQ</span><span class="pun">(</span><span class="pln">res</span><span class="pun">,</span><span class="pln"> kPageSize</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Write incomplete or error "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> res</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span 
class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Write of "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> value_ </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">" completed"</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br></span><span class="pun">};</span><span class="pln"><br><br></span><span class="kwd">class</span><span class="pln"> </span><span class="typ">AIOAdder</span><span class="pln"> </span><span class="pun">:</span><span class="pln"> </span><span class="kwd">public</span><span class="pln"> </span><span class="typ">Adder</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp;</span><span class="kwd">public</span><span class="pun">:</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> fd_</span><span class="pun">;</span><span class="pln"><br>&nbsp; io_context_t ioctx_</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> counter_</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> reap_counter_</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> sum_</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> length_</span><span class="pun">;</span><span class="pln"><br><br>&nbsp; </span><span class="typ">AIOAdder</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> length</span><span class="pun">)</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; </span><span class="pun">:</span><span class="pln"> ioctx_</span><span class="pun">(</span><span class="lit">0</span><span class="pun">),</span><span class="pln"> 
counter_</span><span class="pun">(</span><span class="lit">0</span><span class="pun">),</span><span class="pln"> reap_counter_</span><span class="pun">(</span><span class="lit">0</span><span class="pun">),</span><span class="pln"> sum_</span><span class="pun">(</span><span class="lit">0</span><span class="pun">),</span><span class="pln"> length_</span><span class="pun">(</span><span class="pln">length</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"> </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">Init</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Opening file"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; fd_ </span><span class="pun">=</span><span class="pln"> open</span><span class="pun">(</span><span class="pln">FLAGS_path</span><span class="pun">.</span><span class="pln">c_str</span><span class="pun">(),</span><span class="pln"> O_RDWR </span><span class="pun">|</span><span class="pln"> O_DIRECT </span><span class="pun">|</span><span class="pln"> O_CREAT</span><span class="pun">,</span><span class="pln"> </span><span class="lit">0644</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; PCHECK</span><span class="pun">(</span><span class="pln">fd_ </span><span class="pun">&gt;=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Error opening file"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span 
class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Allocating enough space for the sum"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; PCHECK</span><span class="pun">(</span><span class="pln">fallocate</span><span class="pun">(</span><span class="pln">fd_</span><span class="pun">,</span><span class="pln"> </span><span class="lit">0</span><span class="pun">,</span><span class="pln"> </span><span class="lit">0</span><span class="pun">,</span><span class="pln"> kPageSize </span><span class="pun">*</span><span class="pln"> length_</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&gt;=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Error in fallocate"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Setting up the io context"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; PCHECK</span><span class="pun">(</span><span class="pln">io_setup</span><span class="pun">(</span><span class="lit">100</span><span class="pun">,</span><span class="pln"> </span><span class="pun">&amp;</span><span class="pln">ioctx_</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&gt;=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Error in io_setup"</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">virtual</span><span 
class="pln"> </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">Add</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> amount</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; sum_ </span><span class="pun">+=</span><span class="pln"> amount</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Adding "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> amount </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">" for a total of "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> sum_</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">SubmitWrite</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Submitting a write to "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> counter_</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> iocb iocb</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> iocb</span><span class="pun">*</span><span class="pln"> iocbs </span><span class="pun">=</span><span class="pln"> </span><span class="pun">&amp;</span><span 
class="pln">iocb</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="typ">AIORequest</span><span class="pln"> </span><span class="pun">*</span><span class="pln">req </span><span class="pun">=</span><span class="pln"> </span><span class="kwd">new</span><span class="pln"> </span><span class="typ">AIOWriteRequest</span><span class="pun">(</span><span class="pln">counter_</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; io_prep_pwrite</span><span class="pun">(&amp;</span><span class="pln">iocb</span><span class="pun">,</span><span class="pln"> fd_</span><span class="pun">,</span><span class="pln"> req</span><span class="pun">-&gt;</span><span class="pln">buffer_</span><span class="pun">,</span><span class="pln"> kPageSize</span><span class="pun">,</span><span class="pln"> counter_ </span><span class="pun">*</span><span class="pln"> kPageSize</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; iocb</span><span class="pun">.</span><span class="pln">data </span><span class="pun">=</span><span class="pln"> req</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">int</span><span class="pln"> res </span><span class="pun">=</span><span class="pln"> io_submit</span><span class="pun">(</span><span class="pln">ioctx_</span><span class="pun">,</span><span class="pln"> </span><span class="lit">1</span><span class="pun">,</span><span class="pln"> </span><span class="pun">&amp;</span><span class="pln">iocbs</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; CHECK_EQ</span><span class="pun">(</span><span class="pln">res</span><span class="pun">,</span><span class="pln"> </span><span class="lit">1</span><span class="pun">);</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">WriteFile</span><span class="pun">()</span><span 
class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; reap_counter_ </span><span class="pun">=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">for</span><span class="pln"> </span><span class="pun">(</span><span class="pln">counter_ </span><span class="pun">=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"> counter_ </span><span class="pun">&lt;</span><span class="pln"> length_</span><span class="pun">;</span><span class="pln"> counter_</span><span class="pun">++)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; </span><span class="typ">SubmitWrite</span><span class="pun">();</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; </span><span class="typ">Reap</span><span class="pun">();</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="typ">ReapRemaining</span><span class="pun">();</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">SubmitRead</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Submitting a read from "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> counter_</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> iocb iocb</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> 
iocb</span><span class="pun">*</span><span class="pln"> iocbs </span><span class="pun">=</span><span class="pln"> </span><span class="pun">&amp;</span><span class="pln">iocb</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="typ">AIORequest</span><span class="pln"> </span><span class="pun">*</span><span class="pln">req </span><span class="pun">=</span><span class="pln"> </span><span class="kwd">new</span><span class="pln"> </span><span class="typ">AIOReadRequest</span><span class="pun">(</span><span class="kwd">this</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; io_prep_pread</span><span class="pun">(&amp;</span><span class="pln">iocb</span><span class="pun">,</span><span class="pln"> fd_</span><span class="pun">,</span><span class="pln"> req</span><span class="pun">-&gt;</span><span class="pln">buffer_</span><span class="pun">,</span><span class="pln"> kPageSize</span><span class="pun">,</span><span class="pln"> counter_ </span><span class="pun">*</span><span class="pln"> kPageSize</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; iocb</span><span class="pun">.</span><span class="pln">data </span><span class="pun">=</span><span class="pln"> req</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">int</span><span class="pln"> res </span><span class="pun">=</span><span class="pln"> io_submit</span><span class="pun">(</span><span class="pln">ioctx_</span><span class="pun">,</span><span class="pln"> </span><span class="lit">1</span><span class="pun">,</span><span class="pln"> </span><span class="pun">&amp;</span><span class="pln">iocbs</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; CHECK_EQ</span><span class="pun">(</span><span class="pln">res</span><span class="pun">,</span><span class="pln"> </span><span class="lit">1</span><span class="pun">);</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span 
class="pln"><br><br>&nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">ReadFile</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; reap_counter_ </span><span class="pun">=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">for</span><span class="pln"> </span><span class="pun">(</span><span class="pln">counter_ </span><span class="pun">=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"> counter_ </span><span class="pun">&lt;</span><span class="pln"> length_</span><span class="pun">;</span><span class="pln"> counter_</span><span class="pun">++)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="typ">SubmitRead</span><span class="pun">();</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="typ">Reap</span><span class="pun">();</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="typ">ReapRemaining</span><span class="pun">();</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> </span><span class="typ">DoReap</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> min_nr</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Reaping between "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> min_nr </span><span 
class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">" and "</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; </span><span class="pun">&lt;&lt;</span><span class="pln"> FLAGS_max_nr </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">" io_events"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> io_event</span><span class="pun">*</span><span class="pln"> events </span><span class="pun">=</span><span class="pln"> </span><span class="kwd">new</span><span class="pln"> io_event</span><span class="pun">[</span><span class="pln">FLAGS_max_nr</span><span class="pun">];</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> timespec timeout</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; timeout</span><span class="pun">.</span><span class="pln">tv_sec </span><span class="pun">=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; timeout</span><span class="pun">.</span><span class="pln">tv_nsec </span><span class="pun">=</span><span class="pln"> </span><span class="lit">100000000</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">int</span><span class="pln"> num_events</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Calling io_getevents"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; num_events </span><span class="pun">=</span><span class="pln"> io_getevents</span><span class="pun">(</span><span class="pln">ioctx_</span><span class="pun">,</span><span class="pln"> min_nr</span><span class="pun">,</span><span class="pln"> 
FLAGS_max_nr</span><span class="pun">,</span><span class="pln"> events</span><span class="pun">,</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; </span><span class="pun">&amp;</span><span class="pln">timeout</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Calling completion function on results"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">for</span><span class="pln"> </span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> i </span><span class="pun">=</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"> i </span><span class="pun">&lt;</span><span class="pln"> num_events</span><span class="pun">;</span><span class="pln"> i</span><span class="pun">++)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> io_event </span><span class="kwd">event</span><span class="pln"> </span><span class="pun">=</span><span class="pln"> events</span><span class="pun">[</span><span class="pln">i</span><span class="pun">];</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; </span><span class="typ">AIORequest</span><span class="pun">*</span><span class="pln"> req </span><span class="pun">=</span><span class="pln"> </span><span class="kwd">static_cast</span><span class="pun">&lt;</span><span class="typ">AIORequest</span><span class="pun">*&gt;(</span><span class="kwd">event</span><span class="pun">.</span><span class="pln">data</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; req</span><span class="pun">-&gt;</span><span 
class="typ">Complete</span><span class="pun">(</span><span class="kwd">event</span><span class="pun">.</span><span class="pln">res</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; </span><span class="kwd">delete</span><span class="pln"> req</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">delete</span><span class="pln"> events</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; <br>LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Reaped "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> num_events </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">" io_events"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; reap_counter_ </span><span class="pun">+=</span><span class="pln"> num_events</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">return</span><span class="pln"> num_events</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">Reap</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">if</span><span class="pln"> </span><span class="pun">(</span><span class="pln">counter_ </span><span class="pun">&gt;=</span><span class="pln"> FLAGS_min_nr</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; </span><span class="typ">DoReap</span><span class="pun">(</span><span class="pln">FLAGS_min_nr</span><span class="pun">);</span><span 
class="pln"><br>&nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">void</span><span class="pln"> </span><span class="typ">ReapRemaining</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">while</span><span class="pln"> </span><span class="pun">(</span><span class="pln">reap_counter_ </span><span class="pun">&lt;</span><span class="pln"> length_</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; </span><span class="typ">DoReap</span><span class="pun">(</span><span class="lit">1</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="pun">~</span><span class="typ">AIOAdder</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Closing AIO context and file"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; io_destroy</span><span class="pun">(</span><span class="pln">ioctx_</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; close</span><span class="pun">(</span><span class="pln">fd_</span><span class="pun">);</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> </span><span class="typ">Sum</span><span class="pun">()</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span 
class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Writing consecutive integers to file"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="typ">WriteFile</span><span class="pun">();</span><span class="pln"><br>&nbsp; &nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Reading consecutive integers from file"</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="typ">ReadFile</span><span class="pun">();</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">return</span><span class="pln"> sum_</span><span class="pun">;</span><span class="pln"><br>&nbsp; </span><span class="pun">}</span><span class="pln"><br></span><span class="pun">};</span><span class="pln"><br><br></span><span class="kwd">int</span><span class="pln"> main</span><span class="pun">(</span><span class="kwd">int</span><span class="pln"> argc</span><span class="pun">,</span><span class="pln"> </span><span class="kwd">char</span><span class="pun">*</span><span class="pln"> argv</span><span class="pun">[])</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; google</span><span class="pun">::</span><span class="typ">ParseCommandLineFlags</span><span class="pun">(&amp;</span><span class="pln">argc</span><span class="pun">,</span><span class="pln"> </span><span class="pun">&amp;</span><span class="pln">argv</span><span class="pun">,</span><span class="pln"> </span><span class="kwd">true</span><span class="pun">);</span><span class="pln"><br>&nbsp; </span><span class="typ">AIOAdder</span><span class="pln"> adder</span><span class="pun">(</span><span class="pln">FLAGS_file_size</span><span class="pun">);</span><span 
class="pln"><br>&nbsp; adder</span><span class="pun">.</span><span class="typ">Init</span><span class="pun">();</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> sum </span><span class="pun">=</span><span class="pln"> adder</span><span class="pun">.</span><span class="typ">Sum</span><span class="pun">();</span><span class="pln"><br>&nbsp; </span><span class="kwd">int</span><span class="pln"> expected </span><span class="pun">=</span><span class="pln"> </span><span class="pun">(</span><span class="pln">FLAGS_file_size </span><span class="pun">*</span><span class="pln"> </span><span class="pun">(</span><span class="pln">FLAGS_file_size </span><span class="pun">-</span><span class="pln"> </span><span class="lit">1</span><span class="pun">))</span><span class="pln"> </span><span class="pun">/</span><span class="pln"> </span><span class="lit">2</span><span class="pun">;</span><span class="pln"><br>&nbsp; LOG</span><span class="pun">(</span><span class="pln">INFO</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"AIO is complete"</span><span class="pun">;</span><span class="pln"><br>&nbsp; CHECK_EQ</span><span class="pun">(</span><span class="pln">sum</span><span class="pun">,</span><span class="pln"> expected</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">"Expected "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> expected </span><span class="pun">&lt;&lt;</span><span class="pln"> </span><span class="str">" Got "</span><span class="pln"> </span><span class="pun">&lt;&lt;</span><span class="pln"> sum</span><span class="pun">;</span><span class="pln"><br>&nbsp; printf</span><span class="pun">(</span><span class="str">"Successfully calculated that the sum of integers from 0"</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; 
&nbsp;</span><span class="str">" to %d is %d\n"</span><span class="pun">,</span><span class="pln"> FLAGS_file_size </span><span class="pun">-</span><span class="pln"> </span><span class="lit">1</span><span class="pun">,</span><span class="pln"> sum</span><span class="pun">);</span><span class="pln"><br>&nbsp; </span><span class="kwd">return</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"><br></span><span class="pun">}</span></pre>
+ </div>
+ </div>
+ </td></tr><tr>
+</tr></tbody></table>
+ </div>
+
+
+
+ <div id="wikicommentcol">
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<div class="collapse">
+
+
+
+
+
+<div id="commentlist">
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+ <a class="userlink" href="https://code.google.com/u/103967590852603522648/">bert.hub...@gmail.com</a>,
+
+ </span>
+ <span class="date" title="Wed May 30 01:42:21 2012">May 30, 2012</span>
+ <div>
+<div class="commentcontent">
+<p>Hi Daniel, </p><p>Thanks for writing this fine document. I reference it from <a href="http://bert-hubert.blogspot.com/2012/05/on-linux-asynchronous-file-io.html" rel="nofollow">http://bert-hubert.blogspot.com/2012/05/on-linux-asynchronous-file-io.html</a> - thanks! </p>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+ <a class="userlink" href="https://code.google.com/u/115155307146537731796/">ersun.wa...@gmail.com</a>,
+
+ </span>
+ <span class="date" title="Sun Aug 12 13:57:26 2012">Aug 12, 2012</span>
+ <div>
+<div class="commentcontent">
+<p>Great write-up. Linked here: <a href="http://webfiveoh.com/content/guides/2012/aug/mon-13th/linux-asynchronous-io-and-libaio.html" rel="nofollow">http://webfiveoh.com/content/guides/2012/aug/mon-13th/linux-asynchronous-io-and-libaio.html</a> </p>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+ <a class="userlink" href="https://code.google.com/u/103790604532162480537/">una...@gmail.com</a>,
+
+ </span>
+ <span class="date" title="Tue Feb 19 03:21:40 2013">Feb 19, 2013</span>
+ <div>
+<div class="commentcontent">
+<p>Epoll on linux doesn't support regular files. It always returns ENOPERM when registering them with the epollctl syscall. I've tried with ext4, btrfs, xfs &amp; jfs on Linux 3.6 with the same result. </p><pre class="prettyprint"><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;sys/epoll.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;sys/types.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;sys/stat.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;fcntl.h&gt;</span><span class="pln"><br></span><span class="com">#include</span><span class="pln"> </span><span class="str">&lt;stdio.h&gt;</span><span class="pln"><br><br></span><span class="kwd">int</span><span class="pln"> main</span><span class="pun">(</span><span class="kwd">void</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">int</span><span class="pln"> ep </span><span class="pun">=</span><span class="pln"> epoll_create1</span><span class="pun">(</span><span class="lit">0</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">int</span><span class="pln"> fd </span><span class="pun">=</span><span class="pln"> open</span><span class="pun">(</span><span class="str">"/root/kernel-uek-2.6.39-300.17.2.el6uek.src.rpm"</span><span class="pun">,</span><span class="pln"> O_RDONLY</span><span class="pun">|</span><span class="pln">O_NONBLOCK</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">struct</span><span class="pln"> epoll_event evt </span><span class="pun">=</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; </span><span 
class="pun">.</span><span class="pln">events </span><span class="pun">=</span><span class="pln"> EPOLLIN<br>&nbsp; &nbsp; </span><span class="pun">};</span><span class="pln"><br><br>&nbsp; &nbsp; </span><span class="kwd">if</span><span class="pln"> </span><span class="pun">(</span><span class="pln">ep </span><span class="pun">&lt;</span><span class="pln"> </span><span class="lit">0</span><span class="pln"> </span><span class="pun">||</span><span class="pln"> fd </span><span class="pun">&lt;</span><span class="pln"> </span><span class="lit">0</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; printf</span><span class="pun">(</span><span class="str">"Error opening fds.\n"</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="kwd">return</span><span class="pln"> </span><span class="pun">-</span><span class="lit">1</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"><br><br>&nbsp; &nbsp; </span><span class="kwd">if</span><span class="pln"> </span><span class="pun">(</span><span class="pln">epoll_ctl</span><span class="pun">(</span><span class="pln">ep</span><span class="pun">,</span><span class="pln"> EPOLL_CTL_ADD</span><span class="pun">,</span><span class="pln"> fd</span><span class="pun">,</span><span class="pln"> </span><span class="pun">&amp;</span><span class="pln">evt</span><span class="pun">)</span><span class="pln"> </span><span class="pun">&lt;</span><span class="pln"> </span><span class="lit">0</span><span class="pun">)</span><span class="pln"> </span><span class="pun">{</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; perror</span><span class="pun">(</span><span class="str">"epoll_ctl"</span><span class="pun">);</span><span class="pln"><br>&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="kwd">return</span><span class="pln"> </span><span 
class="pun">-</span><span class="lit">1</span><span class="pun">;</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="pun">}</span><span class="pln"><br>&nbsp; &nbsp; </span><span class="kwd">return</span><span class="pln"> </span><span class="lit">0</span><span class="pun">;</span><span class="pln"><br></span><span class="pun">}</span></pre>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+ <a class="userlink" href="https://code.google.com/u/113801981588238309041/">haghdo...@gmail.com</a>,
+
+ </span>
+ <span class="date" title="Fri Mar 15 22:05:23 2013">Mar 15, 2013</span>
+ <div>
+<div class="commentcontent">
+<p>Thanks for your grate article. There is a compilation error in this line num_events = io_getevents(ioctx<i>, min_nr, FLAGS_max_nr, events.get(),&amp;timeout); </i></p><p>it should be something like this num_events = io_getevents(ioctx<i>, min_nr, FLAGS_max_nr, events,&amp;timeout); </i></p>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+ <a class="userlink" href="https://code.google.com/u/108474981592634289690/">garethb...@gmail.com</a>,
+
+ </span>
+ <span class="date" title="Thu Jun 13 08:56:09 2013">Jun 13, 2013</span>
+ <div>
+<div class="commentcontent">
+<p>It seems the posix_memalign is not actually required, is there some sort of performance benefit to be had by using it?? </p>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+ <a class="userlink" href="https://code.google.com/u/103724282630885316886/">vishn...@gmail.com</a>,
+
+ </span>
+ <span class="date" title="Fri Aug 9 15:26:44 2013">Aug 9, 2013</span>
+ <div>
+<div class="commentcontent">
+<p>The posix_memalign is required and O_DIRECT is required. If I don't align my buffers to PAGE_SIZE, kernel 2.6.32 and 3.3 and 3.5.0-37-generic return -EINVAL in event.res, but event.res2 will be zero. </p>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+ <a class="userlink" href="https://code.google.com/u/103724282630885316886/">vishn...@gmail.com</a>,
+
+ </span>
+ <span class="date" title="Fri Aug 9 17:52:29 2013">Aug 9, 2013</span>
+ <div>
+<div class="commentcontent">
+<p>Actually it's probably O_DIRECT that requires buffer length to be aligned to 512B and offset to be also aligned to 512B. </p>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+ <a class="userlink" href="https://code.google.com/u/104805717887225221125/">nepor...@gmail.com</a>,
+
+ </span>
+ <span class="date" title="Wed Nov 6 07:35:43 2013">Nov 6, 2013</span>
+ <div>
+<div class="commentcontent">
+<p>Linux 3.12 has a solution to blocking io_submit() on regular files - see <a href="http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7869a4a6c5caa7b2e5c41ccaf46eb3371f88eea7" rel="nofollow">http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7869a4a6c5caa7b2e5c41ccaf46eb3371f88eea7</a> </p>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by project member
+
+
+
+ <a class="userlink" href="https://code.google.com/u/111678707898441125339/">dehrenb...@google.com</a>,
+
+ </span>
+ <span class="date" title="Mon Apr 21 08:32:41 2014">Apr 21, 2014</span>
+ <div>
+<div class="commentcontent">
+<p>The sample code here is released into the public domain. So feel free to copy it into whatever program you want to write. </p>
+</div>
+
+
+ </div>
+ </div>
+
+
+
+ <div class="artifactcomment">
+
+
+
+
+
+
+ <span class="author">Comment
+
+ by
+
+
+
+
+ <span class="userlink">dehrenb...@chromium.org</span>,
+
+ </span>
+ <span class="date" title="Tue Jul 22 09:58:03 2014">Jul 22, 2014</span>
+ <div>
+<div class="commentcontent">
+<p>haghdo..., thanks, I've fixed the code based on your comment. </p><p>vishn..., that's right, the alignment restriction is actually based on the backing device (for ext4 at least)--if you're using a device with 4k blocks, 4k alignment is needed, otherwise your direct I/O doesn't work. </p><p>una..., that's right, you can't use epoll directly on the file; instead, you have to use it on the ioctx with the io_set_eventfd syscall. </p>
+</div>
+
+
+ </div>
+ </div>
+
+
+</div>
+</div>
+
+
+
+
+ <script type="text/javascript">
+ function delComment(sequence_num, create_time, delete_mode) {
+ var f = document.forms["delcom"];
+ f.sequence_num.value = sequence_num;
+ f.create_time.value = create_time;
+ f.mode.value = delete_mode;
+ f.submit();
+ return false;
+ }
+ </script>
+
+
+ </div>
+
+
+
+
+
+ <div id="commentform">
+ <form action="https://code.google.com/p/kernel/w/detail.do" method="post">
+ <table>
+ <tbody><tr><td class="vt">
+ <input type="hidden" name="pagename" value="AIOUserGuide">
+ <input type="hidden" name="token" value="ABZ6GAciBR7wXJANw0lEk1JMtMA3vIzrjA:1408361093458">
+ <div class="graytext" style="float: right;">
+ Hint: You can use <a href="http://code.google.com/p/support/wiki/WikiSyntax">Wiki Syntax.</a>
+ </div>
+ <div>Enter a comment:</div>
+ <textarea name="content" rows="6" cols="100"></textarea><br><br>
+ <input type="submit" name="submit" value="Submit">
+ </td>
+ </tr></tbody></table>
+ </form>
+ </div>
+
+
+
+
+
+ <form name="delcom" action="https://code.google.com/p/kernel/w/delComment.do" method="POST">
+ <input type="hidden" name="sequence_num" value="">
+ <input type="hidden" name="create_time" value="">
+ <input type="hidden" name="mode" value="">
+ <input type="hidden" name="pagename" value="AIOUserGuide">
+ <input type="hidden" name="token" value="ABZ6GAciBR7wXJANw0lEk1JMtMA3vIzrjA:1408361093458">
+ </form>
+
+
+ <script src="./AIOUserGuide_files/prettify.js"></script>
+ <script type="text/javascript">
+ prettyPrint();
+ </script>
+
+<script type="text/javascript" src="./AIOUserGuide_files/dit_scripts.js"></script>
+
+
+
+
+
+
+ <script type="text/javascript" src="./AIOUserGuide_files/ph_core.js"></script>
+
+ <script type="text/javascript" src="./AIOUserGuide_files/ph_dwiki.js"></script>
+
+
+
+
+</div>
+
+<div id="footer" dir="ltr">
+ <div class="text">
+ <a href="https://code.google.com/projecthosting/terms.html">Terms</a> -
+ <a href="http://www.google.com/privacy.html">Privacy</a> -
+ <a href="https://code.google.com/p/support/">Project Hosting Help</a>
+ </div>
+</div>
+ <div class="hostedBy" style="margin-top: -20px;">
+ <span style="vertical-align: top;">Powered by <a href="http://code.google.com/projecthosting/">Google Project Hosting</a></span>
+ </div>
+
+
+
+
+
+
+
+
+
+
+<div class="menuDiv instance0" id="menuDiv-projects-dropdown" style="display: none;"><div class="menuCategory default"></div><b class="categoryTitle projects" style="display: block;">Projects</b><div class="menuCategory projects"><a class="menuItem" href="https://code.google.com/p/easyshop-for-plone/" style="display: block;">easyshop-for-plone</a></div><b class="categoryTitle starred_projects" style="display: block;">Starred projects</b><div class="menuCategory starred_projects"><a class="menuItem" href="https://code.google.com/p/easyshop-for-plone/" style="display: block;">easyshop-for-plone</a></div><div class="menuCategory controls"><hr class="menuSeparator"><a class="menuItem" href="https://code.google.com/hosting/" style="display: block;">Find open source projects...</a><a class="menuItem" href="https://code.google.com/hosting/createProject" style="display: block;">Create a project...</a></div></div><div class="menuDiv instance1" id="menuDiv-multilogin-dropdown" style="display: none;"><div class="menuCategory default"><span class="menuText" style="display: block;"><b>nialldouglas14@gmail.com</b></span></div><div class="menuCategory controls"><hr class="menuSeparator"><a class="menuItem" href="http://www.google.com/accounts/AddSession?service=code&continue=https%3A%2F%2Fcode.google.com%2Fp%2Fkernel%2Fwiki%2FAIOUserGuide" style="display: block;"><nobr>Sign in with another account...</nobr></a></div></div></body></html> \ No newline at end of file
diff --git a/reference/Linux KAIO/linux-kaio.txt b/reference/Linux KAIO/linux-kaio.txt
new file mode 100644
index 00000000..8497b468
--- /dev/null
+++ b/reference/Linux KAIO/linux-kaio.txt
@@ -0,0 +1,552 @@
+Linux Asynchronous I/O Explained (Last updated: 13 Apr 2012)
+*******************************************************************************
+ by Vasily Tarasov <tarasov AT vasily dot name>
+
+Asynchronous I/O (AIO) is a method for performing I/O operations so that the
+process that issued an I/O request is not blocked till the data is available.
+Instead, after an I/O request is submitted, the process continues to execute
+its code and can later check the status of the submitted request.
+
+Linux kernel provides only *5* system calls for performing asynchronous I/O.
+Other AIO functions commonly described in the literature are implemented in the
+user space libraries and use the system calls internally. Some libraries can
+also emulate AIO functionality entirely in the user space without any kernel
+support.
+
+There are two main libraries in Linux that facilitate AIO, we will refer to
+them as *libaio* and *librt* (the latter one is a part of libc).
+
+In this text, I first discuss system calls, then libaio, and finally librt.
+
+AIO System Calls
+*******************************************************************************
+ based on Linux 3.2.1 kernel
+
+AIO system call entry points are located in "fs/aio.c" file in the kernel's
+source code. Types and constants exported to the user space reside in
+"/usr/include/linux/aio_abi.h" header file.
+
+There are only 5 AIO system calls:
+
+* int io_setup(unsigned nr_events, aio_context_t *ctxp);
+
+* int io_destroy(aio_context_t ctx);
+
+* int io_submit(aio_context_t ctx, long nr, struct iocb *cbp[]);
+
+* int io_cancel(aio_context_t ctx, struct iocb *, struct io_event *result);
+
+* int io_getevents(aio_context_t ctx, long min_nr, long nr,
+ struct io_event *events, struct timespec *timeout);
+
+I will demonstrate the usage of these system calls using a sequence of programs
+in the increasing order of their complexity.
+
+Program 1:
+
+>> snip start: 1.c >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+00 #define _GNU_SOURCE /* syscall() is not POSIX */
+01
+02 #include <stdio.h> /* for perror() */
+03 #include <unistd.h> /* for syscall() */
+04 #include <sys/syscall.h> /* for __NR_* definitions */
+05 #include <linux/aio_abi.h> /* for AIO types and constants */
+06
+07 inline int io_setup(unsigned nr, aio_context_t *ctxp)
+08 {
+09 return syscall(__NR_io_setup, nr, ctxp);
+10 }
+11
+12 inline int io_destroy(aio_context_t ctx)
+13 {
+14 return syscall(__NR_io_destroy, ctx);
+15 }
+16
+17 int main()
+18 {
+19 aio_context_t ctx;
+20 int ret;
+21
+22 ctx = 0;
+23
+24 ret = io_setup(128, &ctx);
+25 if (ret < 0) {
+26 perror("io_setup error");
+27 return -1;
+28 }
+29
+30 ret = io_destroy(ctx);
+31 if (ret < 0) {
+32 perror("io_destroy error");
+33 return -1;
+34 }
+35
+36 return 0;
+37 }
+
+<< snip end: 1.c <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+For now, ignore first 17 lines of the code and look at main() function. In line
+24 we call io_setup() system call to create so called "AIO context" in the
+kernel. AIO context is a set of data structures that the kernel supports to
+perform AIO. Every process can have multiple AIO contexts and as such one
+needs an identifier for every AIO context in a process (XXX: come up with a
+handy example how it can be used). Ctx variable of type aio_context_t defined in
+line 19 stores such an identifier in our example. A pointer to ctx variable
+is passed to io_setup() as a second argument and kernel fills this variable
+with a context identifier. Interestingly, aio_context_t is actually just an
+unsigned long defined in the kernel ("linux/aio_abi.h") like that:
+
+typedef unsigned long aio_context_t;
+
+In line 22 we set ctx to 0 which is required by kernel or io_setup() fails with
+-EINVAL error.
+
+The first argument of io_setup() function - 128 in our case - is the maximum
+number of requests that can simultaneously reside in the context. This will be
+explained in more details in the next examples.
+
+In line 30 we destroy just created AIO context by calling io_destroy() system
+call with ctx as an argument.
+
+The lines above 17 are just helpers that allow to call system calls directly. We
+use glibc's syscall() function that invokes any system call by its number. It
+is only required if one wants to call system calls directly without using AIO
+libraries' wrapper functions (provided by libaio and librt). Notice, that
+syscall() function's return value follows the usual conventions for indicating
+an error: -1, with errno set to a positive value that indicates the error.
+So, we check if the values returned by io_setup() and io_destroy() are less than
+zero to detect the error, and then use perror() function that will print the
+errno.
+
+In the last example we did a minimal thing: created an AIO context and then
+destroyed it immediately. In the next example we submit one request to the
+context and then query its status later.
+
+Program 2:
+
+>> snip start: 2.c >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+00 #define _GNU_SOURCE /* syscall() is not POSIX */
+01
+02 #include <stdio.h> /* for perror() */
+03 #include <unistd.h> /* for syscall() */
+04 #include <sys/syscall.h> /* for __NR_* definitions */
+05 #include <linux/aio_abi.h> /* for AIO types and constants */
+06 #include <fcntl.h> /* O_RDWR */
+07 #include <string.h> /* memset() */
+08 #include <inttypes.h> /* uint64_t */
+09
+10 inline int io_setup(unsigned nr, aio_context_t *ctxp)
+11 {
+12 return syscall(__NR_io_setup, nr, ctxp);
+13 }
+14
+15 inline int io_destroy(aio_context_t ctx)
+16 {
+17 return syscall(__NR_io_destroy, ctx);
+18 }
+19
+20 inline int io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
+21 {
+22 return syscall(__NR_io_submit, ctx, nr, iocbpp);
+23 }
+24
+25 inline int io_getevents(aio_context_t ctx, long min_nr, long max_nr,
+26 struct io_event *events, struct timespec *timeout)
+27 {
+28 return syscall(__NR_io_getevents, ctx, min_nr, max_nr, events, timeout);
+29 }
+30
+31 int main()
+32 {
+33 aio_context_t ctx;
+34 struct iocb cb;
+35 struct iocb *cbs[1];
+36 char data[4096];
+37 struct io_event events[1];
+38 int ret;
+39 int fd;
+40
+41 fd = open("/tmp/testfile", O_RDWR | O_CREAT);
+42 if (fd < 0) {
+43 perror("open error");
+44 return -1;
+45 }
+46
+47 ctx = 0;
+48
+49 ret = io_setup(128, &ctx);
+50 if (ret < 0) {
+51 perror("io_setup error");
+52 return -1;
+53 }
+54
+55 /* setup I/O control block */
+56 memset(&cb, 0, sizeof(cb));
+57 cb.aio_fildes = fd;
+58 cb.aio_lio_opcode = IOCB_CMD_PWRITE;
+59
+60 /* command-specific options */
+61 cb.aio_buf = (uint64_t)data;
+62 cb.aio_offset = 0;
+63 cb.aio_nbytes = 4096;
+64
+65 cbs[0] = &cb;
+66
+67 ret = io_submit(ctx, 1, cbs);
+68 if (ret != 1) {
+69 if (ret < 0)
+70 perror("io_submit error");
+71 else
+72 fprintf(stderr, "could not sumbit IOs");
+73 return -1;
+74 }
+75
+76 /* get the reply */
+77 ret = io_getevents(ctx, 1, 1, events, NULL);
+78 printf("%d\n", ret);
+79
+80 ret = io_destroy(ctx);
+81 if (ret < 0) {
+82 perror("io_destroy error");
+83 return -1;
+84 }
+85
+86 return 0;
+87 }
+
+<< snip end: 2.c <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+Every I/O request that is submitted to an AIO context is represented by an I/O
+control block structure - struct iocb - declared in line 34. We initialize this
+structure in lines 55-63. First, the whole structure is zeroed, then file
+descriptor (aio_fildes) and command (aio_lio_opcode) fields are set.
+
+File descriptor corresponds to a previously opened file, in our example we
+open "/tmp/testfile" file in line 41.
+
+AIO commands currently supported by Linux kernel are:
+
+IOCB_CMD_PREAD
+ positioned read; corresponds to pread() system call.
+
+IOCB_CMD_PWRITE
+ positioned write; corresponds to pwrite() system call.
+
+IOCB_CMD_FSYNC
+ sync file's data and metadata with disk; corresponds to fsync() system call.
+
+IOCB_CMD_FDSYNC
+ sync file's data and metadata with disk, but only metadata needed to access
+ modified file data is written; corresponds to fdatasync() system call.
+
+IOCB_CMD_PREADV
+ vectored positioned read, sometimes called "scattered input";
+ corresponds to preadv() system call.
+
+IOCB_CMD_PWRITEV
+ vectored positioned write, sometimes called "gathered output";
+ corresponds to pwritev() system call.
+
+IOCB_CMD_NOOP
+ defined in the header file, but is not used anywhere else in the kernel.
+
+The semantics of other fields in the iocb structure depends on the command
+specified. For now, we will limit our discussion to IOCB_CMD_PREAD and
+IOCB_CMD_PWRITE commands. After understanding AIO interface for these two
+commands, we will look into the remaining ones.
+
+In lines 60-63 of our running example we set command-specific fields of iocb
+structure: aio_buf and aio_nbytes correspond to a region in memory to which
+data should be read or written to; aio_offset is an absolute offset in a file.
+
+Now, when one I/O control block is ready, we put a pointer to it in an array
+(line 65) and then pass this array to the io_submit() system call (line 67).
+io_submit() takes AIO context ID, size of the array and the array itself as the
+arguments. Notice, that array should contain *pointers* to the iocb structures,
+not the structures themselves.
+
+io_submit()'s return code can be one of the following values:
+
+A) ret = (number of iocbs submitted)
+ Ideal case, all iocbs were accepted for processing.
+
+B) 0 < ret < (number of iocbs submitted)
+ io_submit() system call processes iocbs one by one starting from
+ the first entry in the passed array. If submission of some iocb fails,
+ it stops at this point and returns the index of iocb that failed.
+ There is no way to know what is the exact reason of a failure.
+ However, if the very first iocb submission fails, see point C.
+
+C) ret < 0
+ There are two reasons why this could happen:
+ 1) Some error happened even before io_submit() started to iterate
+ over iocbs in the array (e.g., AIO context was invalid).
+ 2) The submission of the very first iocb (cbs[0]) failed.
+
+So, in our example, we handle io_submit()'s return code in an unusual way. If
+return code is non-negative but not equal to the number of iocbs, then that is a
+clear error but we don't know its reason (errno is not set). Consequently, we use
+fprintf(stderr, ...) function to print error notification on the screen.
+If, however, return code is less than zero, then we know the error (errno is
+set) and use perror() function instead. Notice, that in case of a single iocb
+in the array (as in our example) such a complex error handling makes less sense:
+if the first (and only) iocb fails, we are guaranteed to get an error
+information (see point C above). We handle error in a more complex way in this
+example only to reuse the same code later, when we submit multiple iocbs in a
+single io_submit() call.
+
+After iocb is submitted we can perform any other actions without waiting for I/O
+to complete. For every completed I/O request (successfully or unsuccessfully)
+kernel creates an io_event structure. To obtain the list of io_events (and
+consequently all completed iocbs) io_getevents() system call should be used (line
+77). When calling io_getevents(), one needs to specify:
+
+a) which AIO context to get events from (ctx variable)
+
+b) a buffer where the kernel should load events to (events variable)
+
+c) minimal number of events one wants to get (first 1 in our program).
+ If fewer than this number of iocbs are currently completed,
+ io_getevents() will block till enough events appear. See point e)
+ for more details on how to control blocking time.
+
+d) maximum number of events one wants to get. This usually is
+ the size of the events buffer (second 1 in our program)
+
+e) If not enough events are available, we don't want to wait forever.
+ One can specify a relative deadline as the last argument.
+ NULL in this case means to wait infinitely.
+ If one wants io_getevents() not to block at all then
+ timespec timeout structure needs to be initialized to zero
+ seconds and zero nanoseconds.
+
+The return code of io_getevents can be:
+
+A) ret = (max number of events)
+ All events that fit in the user provided buffer were obtained
+ from the kernel. There might be more pending events in the kernel.
+B) (min number of events) <= ret <= (max number of events)
+ All currently available events were read from the kernel and no
+ blocking happened.
+C) 0 < ret < (min number of events)
+ All currently available events were read from the kernel and
+ we blocked to wait for the time user has specified.
+E) ret = 0
+ no events are available XXX:? does blocking happen in this case?..
+
+F) ret < 0
+ an error happened
+
+
+TO BE CONTINUED...
+
+
+/proc/sys/fs/aio-max-nr
+/proc/sys/fs/aio-nr
+
+Note that timeout is relative and will be updated if not NULL and the operation
+blocks
+
+Check how vectors are provided to vectored PREADV and PWRITEV commands.
+
+Other fields to fill/explain:
+
+ /* these are internal to the kernel/libc. */
+ __u64 aio_data; /* data to be returned in event's data */
+ __u32 PADDED(aio_key, aio_reserved1);
+ /* the kernel sets aio_key to the req # */
+
+ /* common fields */
++++ __u16 aio_lio_opcode; /* see IOCB_CMD_ above */
+ __s16 aio_reqprio;
+ __u32 aio_fildes;
+
+ __u64 aio_buf;
+ __u64 aio_nbytes;
+ __s64 aio_offset;
+
+ /* extra parameters */
+ __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */
+
+ /* flags for the "struct iocb" */
+ __u32 aio_flags;
+
+ /*
+ * if the IOCB_FLAG_RESFD flag of "aio_flags" is set, this is an
+ * eventfd to signal AIO readiness to
+ */
+ __u32 aio_resfd;
+
+*** SYNC RELATED COMMANDS ***
+IOCB_CMD_FSYNC
+ sync file's data and metadata with disk; corresponds to fsync() system call.
+
+IOCB_CMD_FDSYNC
+ sync file's data and metadata with disk, but only metadata needed to access
+ modified file data is written; corresponds to fdatasync() system call.
+
+
+*** VECTORED INPUT and OUTPUT ***
+IOCB_CMD_PREADV
+ vectored positioned read, sometimes called "scattered input";
+ corresponds to preadv() system call.
+
+IOCB_CMD_PWRITEV
+ vectored positioned write, sometimes called "gathered output";
+ corresponds to pwritev() system call.
+
+*** OTHER COMMANDS ***
+IOCB_CMD_NOOP
+ defined in the header file, but is not used anywhere else in the kernel.
+
+XXX: Maybe discuss Poll and other semi-existing commands here?...
+
+*********************************************************
+********************* LIBAIO LIBRARY ********************
+*********************************************************
+
+libaio:
+/lib64/libaio.so.1 (shared library)
+
+libaio-devel:
+/usr/include/libaio.h (header library)
+/usr/lib64/libaio.a (static library)
+
+Functions:
+
+a) Actual system call wrappers:
+
+int io_setup(int maxevents, io_context_t *ctxp);
+int io_destroy(io_context_t ctx);
+int io_submit(io_context_t ctx, long nr, struct iocb *ios[]);
+int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *evt);
+int io_getevents(io_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout);
+
+io_context_t is a pointer to a non-existent structure:
+
+typedef struct io_context *io_context_t;
+
+Not a single line of code in any user tool or in the libaio library looks at the
+members of 'struct io_context'. So, gcc happily compiles the code even though
+struct io_context is not defined. This structure is probably defined just for
+type checking. The rule of thumb when using libaio is just to declare all
+variables as io_context_t and forget that it actually is a pointer!
+
+b) Convenience helpers (static inline functions):
+
+static inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)
+static inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)
+static inline void io_prep_preadv(struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, long long offset)
+static inline void io_prep_pwritev(struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, long long offset)
+
+static inline void io_prep_poll(struct iocb *iocb, int fd, int events)
+static inline void io_prep_fsync(struct iocb *iocb, int fd)
+static inline void io_prep_fdsync(struct iocb *iocb, int fd)
+
+static inline int io_poll(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd, int events)
+static inline int io_fsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd)
+static inline int io_fdsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd)
+
+static inline void io_set_eventfd(struct iocb *iocb, int eventfd);
+
+*********************************************************
+******** MATCHING LIBAIO AND KERNEL INTERFACE ***********
+*********************************************************
+
+libaio.h redefines some of the kernel definitions (God knows why),
+but they match at the binary level. E.g., this is kernel
+exported definition of iocb:
+
+struct iocb {
+ /* these are internal to the kernel/libc. */
+ __u64 aio_data; /* data to be returned in event's data */
+ __u32 PADDED(aio_key, aio_reserved1);
+ /* the kernel sets aio_key to the req # */
+
+ /* common fields */
+ __u16 aio_lio_opcode; /* see IOCB_CMD_ above */
+ __s16 aio_reqprio;
+ __u32 aio_fildes;
+
+ __u64 aio_buf;
+ __u64 aio_nbytes;
+ __s64 aio_offset;
+
+ /* extra parameters */
+ __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */
+
+ /* flags for the "struct iocb" */
+ __u32 aio_flags;
+
+ /*
+ * if the IOCB_FLAG_RESFD flag of "aio_flags" is set, this is an
+ * eventfd to signal AIO readiness to
+ */
+ __u32 aio_resfd;
+}; /* 64 bytes */
+
+And this is definition of iocb by libaio.h:
+
+struct io_iocb_common {
+ PADDEDptr(void *buf, __pad1);
+ PADDEDul(nbytes, __pad2);
+ long long offset;
+ long long __pad3;
+ unsigned flags;
+ unsigned resfd;
+}; /* result code is the amount read or -'ve errno */
+
+
+struct iocb {
+ PADDEDptr(void *data, __pad1); /* Return in the io completion event */
+ PADDED(unsigned key, __pad2); /* For use in identifying io requests */
+
+ short aio_lio_opcode;
+ short aio_reqprio;
+ int aio_fildes;
+
+ union {
+ struct io_iocb_common c;
+ struct io_iocb_vector v;
+ struct io_iocb_poll poll;
+ struct io_iocb_sockaddr saddr;
+ } u;
+};
+
+
+
+
+****** AIO LIBRARY *****
+
+glibc:
+/lib64/librt.so.1
+
+glibc-headers:
+/usr/include/aio.h
+
+Provide POSIX-defined interface for async I/O.
+
+aio_read()
+aio_write()
+aio_cancel()
+aio_error()
+aio_fsync()
+aio_suspend()
+aio_return()
+
+lio_listio
+
+
+****** To discover ****
+XXX: see if these are implemented in some other kernels:
+/* These two are experimental.
+ * IOCB_CMD_PREADX = 4,
+ * IOCB_CMD_POLL = 5,
+ */
+XXX: potential resubmission of the wrong iocb, knowing its index.
+XXX: two AIO contexts per process?
+
+
diff --git a/reference/iron-sosp05.pdf b/reference/iron-sosp05.pdf
new file mode 100644
index 00000000..0359a0ee
--- /dev/null
+++ b/reference/iron-sosp05.pdf
Binary files differ
diff --git a/reference/osdi14-paper-pillai.pdf b/reference/osdi14-paper-pillai.pdf
new file mode 100644
index 00000000..91d27814
--- /dev/null
+++ b/reference/osdi14-paper-pillai.pdf
Binary files differ