Index: chap-one-side/one-side-2.tex =================================================================== --- chap-one-side/one-side-2.tex (revision 2082) +++ chap-one-side/one-side-2.tex (working copy) @@ -174,17 +174,33 @@ process, at window creation. \begin{rationale} -The window size is specified using an address-sized integer, to allow windows that span -more than 4~GB of -address space. (Even if the physical memory size is less than 4~GB, the -address range may be larger than 4~GB, if addresses are not contiguous.) +The window size is specified using an address-sized integer +%% B3.1 +\color{red}% +%% 3.1Note: Nothing in C says that an int is 4 bytes. This text is just wrong. +%, to allow windows that span +%more than 4~GB of +%address space. (Even if the physical memory size is less than 4~GB, the +%address range may be larger than 4~GB, if addresses are not contiguous.) +rather than a basic integer type to allow windows that span more memory than +can be described with a basic integer type. +%% E3.1 +\color{black}% \end{rationale} \begin{users} Common choices for \mpiarg{disp\_unit} are 1 (no scaling), and (in C syntax) \code{sizeof(type)}, for a window that consists of an array of elements of type \code{type}. The -later choice will allow one to use array indices in \RMA/ calls, and have those scaled correctly to byte displacements, even in a heterogeneous environment. +%% B3.1 +\color{red}% +%later +latter +%% E3.1 +\color{black}% +choice will allow one to use array indices in \RMA/ calls, +and have those scaled correctly to byte displacements, even in a +heterogeneous environment. \end{users} The \mpiarg{info} argument provides @@ -408,7 +424,11 @@ This is a collective call executed by all processes in the group of -\mpiarg{comm}. On each process $i$, it allocates memory of at least +\mpiarg{comm}. 
On each process% +%% B3.1 +% $i$ +%% E3.1 +, it allocates memory of at least \mpiarg{size} bytes that is shared among all processes in \mpiarg{comm}, and returns a pointer to the locally allocated segment in \mpiarg{baseptr} that can be used for @@ -520,7 +540,13 @@ different process-local addresses for the same physical memory on different processes. The returned memory can be used for load/store accesses subject to the constraints defined in Section~\ref{sec:1sided-semantics}. This function -can only be called with windows of type +can only be called with windows of +%% B3.1 +\color{red}% +% type +flavor +%% E3.1 +\color{black}% \mpiarg{MPI\_WIN\_FLAVOR\_SHARED}. If the passed window is not of flavor \mpiarg{MPI\_WIN\_FLAVOR\_SHARED}, the error \error{MPI\_ERR\_RMA\_FLAVOR} is raised. @@ -632,7 +658,15 @@ \begin{users} Users are cautioned that displacement arithmetic can overflow in variables of type \type{MPI\_Aint} and result in unexpected values on some -platforms. This issue may be addressed in a future version of \MPI/. +platforms. +%% B3.1 +\color{red}% +The \mpifunc{MPI\_AINT\_ADD} and \mpifunc{MPI\_AINT\_DIFF} +functions can be used to safely perform address arithmetic with \type{MPI\_Aint} +displacements. +%This issue may be addressed in a future version of \MPI/. +%% E3.1 +\color{black}% \end{users} \begin{implementors} @@ -646,9 +680,16 @@ from any process. \end{implementors} -Memory in this window may not be used as the target of one-sided -accesses in this window until it is attached using the function -\mpifunc{MPI\_WIN\_ATTACH}. +%% B3.1 +\color{red}% +Memory at the target cannot be accessed with this window until that memory +has been +attached using the function \mpifunc{MPI\_WIN\_ATTACH}. +%Memory in this window may not be used as the target of one-sided +%accesses in this window until it is attached using the function +%\mpifunc{MPI\_WIN\_ATTACH}. 
+%% E3.1 +\color{black}% That is, in addition to using \mpifunc{MPI\_WIN\_CREATE\_DYNAMIC} to create an \MPI/ window, the user must use \mpifunc{MPI\_WIN\_ATTACH} before any local memory may be the target of an \MPI/ \RMA/ operation. @@ -684,8 +725,12 @@ regions may be attached to the same window. \begin{rationale} -Requiring that memory be explicitly attached before it is exposed to -one-sided access by other processes can significantly simplify +Requiring that memory be explicitly attached before it is exposed to +one-sided access by other processes can +%% B3.1 +% significantly +%% E3.1 +simplify implementations and improve performance. The ability to make memory available for \RMA/ operations without requiring a collective \mpifunc{MPI\_WIN\_CREATE} call is needed for some one-sided programming @@ -1178,8 +1223,15 @@ is as if the target datatype object was defined at the target process by the same sequence of calls used to define it at the origin process. The target datatype must -contain only relative displacements, not absolute addresses. The same -holds for get and accumulate. +contain only relative displacements, not absolute addresses. +The same +holds for get and accumulate +%% B3.1 +\color{red}% +operations% +%% E3.1 +\color{black}% +. \begin{users} The \mpiarg{target\_datatype} argument is a handle to a datatype @@ -1206,11 +1258,16 @@ A high-quality implementation will attempt to prevent remote accesses to memory outside the -window that was exposed by the process. This, both for debugging -purposes, and for protection with client-server codes that use \RMA/. +window that was exposed by the process. +%% B3.1 +\color{red}% +This is important both for debugging +purposes and for protection with client-server codes that use \RMA/. 
+%% E3.1 +\color{black}% I.e., a high-quality implementation will check, if possible, window bounds on each \RMA/ call, -and raise an \MPI/ exception at the origin call if an out-of-bound +and raise an \MPI/ exception at the origin call if an out-of-bound situation occurs. Note that the condition can be checked at the origin. Of course, the added safety achieved by such checks has to be weighed @@ -1423,11 +1480,17 @@ \label{sec:1sided-accumulate} It is often useful in a put operation to combine the data moved to the -target process with the data that resides at that process, rather -then replacing the data there. This will allow, for example, the -accumulation of +target process with the data that resides at that process, rather +%% B3.1 +\color{red}% +%then replacing the data there. +than replacing it. +%% E3.1 +\color{black}% +This will allow, for example, the +accumulation of a sum by having all involved processes add their -contributions to the +contributions to the sum variable in the memory of one process. The accumulate functions have slightly different semantics with respect to overlapping data accesses than @@ -2555,10 +2618,23 @@ \mpicppemptybind{MPI::Win::Lock(int lock\_type, int rank, int assert) const}{void} -Starts an \RMA/ access epoch. Only the window at the +Starts an \RMA/ access epoch. +%% B3.1 +\color{red}% +The +%% E3.1 +\color{black}% +window at the process with rank \mpiarg{rank} can be accessed by \RMA/ operations on \mpiarg{win} during that -epoch. +epoch. +%% B3.1 +\color{red}% +Multiple \RMA/ access epochs (with calls to \mpifunc{MPI\_WIN\_LOCK}) +can occur simultaneously; however, each access epoch must target a +different process. +%% E3.1 +\color{black}% \begin{funcdef}{MPI\_WIN\_LOCK\_ALL(assert, win)} \funcarg{\IN}{assert}{program assertion (integer)} @@ -2602,8 +2678,13 @@ \mpicppemptybind{MPI::Win::Unlock(int rank) const}{void} -Completes an \RMA/ access epoch started by a call to -\mpifunc{MPI\_WIN\_LOCK($\ldots$,win)}.
\RMA/ operations issued during this +Completes an \RMA/ access epoch started by a call to +%% B3.1 +\color{red}% +\mpifunc{MPI\_WIN\_LOCK} on window \mpiarg{win}. +%% E3.1 +\color{black}% +\RMA/ operations issued during this period will have completed both at the origin and at the target when the call returns. \begin{funcdef}{MPI\_WIN\_UNLOCK\_ALL(win)} @@ -2619,9 +2700,16 @@ %\mpicppemptybind{MPI::Win::Unlock\_all() const}{void} Completes a shared \RMA/ access epoch started by a call to -\mpifunc{MPI\_WIN\_LOCK\_ALL(assert, win)}. \RMA/ operations issued during this +%% B3.1 +\color{red}% +\mpifunc{MPI\_WIN\_LOCK\_ALL} on window \mpiarg{win}. +%% E3.1 +\color{black}% +\RMA/ operations issued during this epoch will have completed both at the origin and at the target when the call returns. +\medskip%%ALLOWLATEX%% + Locks are used to protect accesses to the locked target window effected by \RMA/ calls issued between the lock and unlock calls, and to protect @@ -2680,11 +2768,13 @@ impose restrictions that allows one to use shared memory for third party communication in shared memory machines. -The downside of this decision is that passive target communication cannot be -used without taking advantage of nonstandard Fortran features: namely, -the availability of C-like pointers; these are not supported by some -Fortran -compilers. +%% B3.1 +%The downside of this decision is that passive target communication cannot be +%used without taking advantage of nonstandard Fortran features: namely, +%the availability of C-like pointers; these are not supported by some +%Fortran +%compilers. +%% E3.1 \end{rationale} Consider the sequence of calls in the example below. @@ -2858,10 +2948,13 @@ \begin{users} C/C++ users can use bit vector or ($\mid$) to combine these constants; Fortran 90 users -can use the bit-vector \code{IOR} intrinsic. -Fortran 77 users can use (nonportably) -bit -vector \code{IOR} on systems that support it. 
Alternatively, Fortran users can +can use the bit-vector \code{IOR} intrinsic. +%% B3.1 +%Fortran 77 users can use (nonportably) +%bit +%vector \code{IOR} on systems that support it. +%% E3.1 +Alternatively, Fortran users can portably use integer addition to OR the constants (each constant should appear at most once in the addition!). \end{users} @@ -3102,9 +3195,17 @@ window copy can be delayed in both memory models until the window owner executes a synchronization call. When passive target -synchronization (lock/unlock or even flush) is used, it is necessary to update the public window -copy in the \RMA/ separate model, or the private window copy in the \RMA/ -unified model, even if the window owner does not execute any related +synchronization +%% B3.1 +% (lock/unlock or even flush) +%% E3.1 +is used, it is necessary to update the public window +copy% +%% B3.1 +% in the \RMA/ separate model, or the private window copy in the \RMA/ +%unified model +%% E3.1 +, even if the window owner does not execute any related synchronization call. The rules above also define, by implication, when an update to a @@ -3155,13 +3256,16 @@ must obey the following rules. \begin{enumerate} -\item +%% B3.1 +\def\makelabel#1{\hss\llap{S#1}} +%% E3.1 +\item\label{rule:s1} A location in a window must not be accessed with load/store operations once an update to that location has started, until the update becomes visible in the private window copy in process -memory. -\item +memory. +\item\label{rule:s2} A location in a window must not be accessed as a target of an \RMA/ operation once an update to that location has started, until the update becomes visible in the public window copy. There is one @@ -3170,7 +3274,7 @@ predefined datatype, on the same window. Additional restrictions on the operation apply, see the info key \mpiarg{accumulate\_ops} in Section~\ref{chap:one-side-2:win_create}. 
-\item +\item\label{rule:s3} A put or accumulate must not access a target window once a %load/ store % update @@ -3205,18 +3309,25 @@ (that is, updates to one are made visible to the other). In the \const{MPI\_WIN\_UNIFIED} memory model, the rules are -much simpler because the public and private windows are the same. +%% B3.1 +%much +%% E3.1 +simpler because the public and private windows are the same. However, there are restrictions to avoid concurrent access to the same memory locations by different processes. The rules that a program with a well-defined outcome must obey in this case are: \begin{enumerate} -\item +%% B3.1 +\def\makelabel#1{\hss\llap{U#1}} +%% E3.1 +\item\label{rule:u1} A location in a window must not be accessed with load/store operations once an update to that location has started, until the update is complete, subject to the following special case. -\item Accessing a location in the +\item\label{rule:u2} +Accessing a location in the window that is also the target of a remote update is valid (not erroneous) but the precise result will depend on the behavior of the implementation. Updates from a remote process will appear in the memory of @@ -3241,7 +3352,8 @@ may produce unexpected results. \end{users} -\item Updating a location in the +\item\label{rule:u3} +Updating a location in the window with a store operation that is also the target of a remote read (but not update) is valid (not erroneous) but the precise result will depend on the behavior @@ -3258,7 +3370,7 @@ behavior only if the other rules given here and elsewhere in this chapter are followed. -\item +\item\label{rule:u4} A location in a window must not be accessed as a target of an \RMA/ operation once an update to that location has started and until the @@ -3268,7 +3380,7 @@ predefined datatype on the same window. Additional restrictions on the operation apply; see the info key \mpiarg{accumulate\_ops} in Section~\ref{chap:one-side-2:win_create}. 
-\item +\item\label{rule:u5} A put or accumulate must not access a target window once a store, put, or accumulate update to another (overlapping) target window @@ -3294,13 +3406,14 @@ Example~\ref{ex:shmem-sync}. \end{users} -Note that \mpifunc{MPI\_WIN\_FLUSH} and \mpifunc{MPI\_WIN\_FLUSH\_ALL} -may be used within a passive target epoch to complete \RMA/ -operations at the target process. +%% B3.1 +%Note that \mpifunc{MPI\_WIN\_FLUSH} and \mpifunc{MPI\_WIN\_FLUSH\_ALL} +%may be used within a passive target epoch to complete \RMA/ +%operations at the target process. +%% E3.1 A program that violates these rules has undefined behavior. - \begin{users} A user can write correct programs by following the following rules: \begin{description} @@ -3689,7 +3802,14 @@ \mpifunc{MPI\_ACCUMULATE}) are executed and committed in program order. Ordering only applies to operations originating at the same origin that access overlapping target memory regions. \MPI/ does not provide any -guarantees for accesses or updates from different origins to overlapping +guarantees for accesses or updates from different +%% B3.1 +\color{red}% +% origins +origin processes +%% E3.1 +\color{black}% +to overlapping target memory regions. The default strict ordering may incur a significant performance penalty. @@ -3708,7 +3828,13 @@ whether operations of the specified type complete in the order they were issued. For example, \infoval{raw} means that any writes must complete at the target -before any reads. These ordering requirements apply only to operations issued +%% B3.1 +\color{red}% +%before any reads. +before subsequent reads. +%% E3.1 +\color{black}% +These ordering requirements apply only to operations issued by the same origin process and targeting the same target process. 
The default value for \infokey{accumulate\_ordering} is \constskip{rar,raw,war,waw}, which implies that writes complete at the target @@ -3937,7 +4063,7 @@ MPI_Put(&frombuf[i], 1, fromtype[i], toneighbor[i], todisp[i], 1, totype[i], win); MPI_Win_fence((MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED), win); - } +} \end{verbatim} The same code could be written with get rather than put. Note that, during the communication phase, each @@ -3977,7 +4103,7 @@ fromdisp[i], 1, fromtype[i], win); update_core(A); MPI_Win_fence(MPI_MODE_NOSUCCEED, win); - } +} \end{verbatim} The get communication can be concurrent with the core update, since they do not access the same locations, and the local update of the @@ -4018,7 +4144,7 @@ todisp[i], 1, totype[i], win); MPI_Win_complete(win); MPI_Win_wait(win); - } +} \end{verbatim} \end{example} @@ -4050,7 +4176,7 @@ update_core(A); MPI_Win_complete(win); MPI_Win_wait(win); - } +} \end{verbatim} \end{example} @@ -4108,7 +4234,7 @@ MPI_Win_post(neighbors, (MPI_MODE_NOCHECK | MPI_MODE_NOPUT), win0); MPI_Win_complete(win1); MPI_Win_wait(win1); - } +} \end{verbatim} A process posts the local window associated with @@ -4372,11 +4498,11 @@ else MPI_Waitany(M, put_req, &j, MPI_STATUS_IGNORE); - MPI_Rget(data[j], N, MPI_DOUBLE, target, i*N, N, MPI_DOUBLE, win, + MPI_Rget(data[j], N, MPI_DOUBLE, target, i*N, N, MPI_DOUBLE, win, &get_req); MPI_Wait(&get_req,MPI_STATUS_IGNORE); compute(i, data[j], ...); - MPI_Rput(data[j], N, MPI_DOUBLE, target, i*N, N, MPI_DOUBLE, win, + MPI_Rput(data[j], N, MPI_DOUBLE, target, i*N, N, MPI_DOUBLE, win, &put_req[j]); } @@ -4459,7 +4585,7 @@ elem_ptr->next = nil; MPI_Win_attach(win, elem_ptr, sizeof(llist_elem_t)); - /* Add the element to the list of local elements so we can free + /* Add the element to the list of local elements so we can free it later. */ if (my_elems_size == my_elems_count) { my_elems_size += 100;