author     José Pekkarinen <jose.pekkarinen@nokia.com>   2016-04-11 10:41:07 +0300
committer  José Pekkarinen <jose.pekkarinen@nokia.com>   2016-04-13 08:17:18 +0300
commit     e09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
tree       d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/fs/xfs
parent     f93b97fd65072de626c074dbe099a1fff05ce060 (diff)
These changes are the raw update to linux-4.4.6-rt14. Kernel sources are taken
from kernel.org, and the rt patch from the rt wiki download page. During the
rebase, the following patch collided: "Force tick interrupt and get rid of
softirq magic" (I70131fb85). The colliding hunks were dropped because their
logic was already present in the source.

Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/fs/xfs')
-rw-r--r--  kernel/fs/xfs/Makefile | 4
-rw-r--r--  kernel/fs/xfs/kmem.c | 10
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_alloc.c | 313
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_alloc.h | 18
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_alloc_btree.c | 4
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_attr.c | 33
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_attr_leaf.c | 7
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_attr_remote.c | 11
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_bit.c (renamed from kernel/fs/xfs/xfs_bit.c) | 0
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_bmap.c | 95
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_bmap.h | 13
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_bmap_btree.c | 5
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_btree.c | 31
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_btree.h | 39
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_da_btree.c | 36
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_dir2.c | 42
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_dir2_block.c | 7
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_dir2_data.c | 7
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_dir2_leaf.c | 7
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_dir2_node.c | 10
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_dquot_buf.c | 4
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_format.h | 105
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_fs.h | 11
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_ialloc.c | 557
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_ialloc.h | 15
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_ialloc_btree.c | 95
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_ialloc_btree.h | 10
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_inode_buf.c | 24
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_sb.c | 65
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_shared.h | 6
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_symlink_remote.c | 11
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_trans_resv.h | 4
-rw-r--r--  kernel/fs/xfs/libxfs/xfs_trans_space.h | 2
-rw-r--r--  kernel/fs/xfs/xfs_acl.c | 14
-rw-r--r--  kernel/fs/xfs/xfs_acl.h | 4
-rw-r--r--  kernel/fs/xfs/xfs_aops.c | 221
-rw-r--r--  kernel/fs/xfs/xfs_aops.h | 8
-rw-r--r--  kernel/fs/xfs/xfs_attr_inactive.c | 6
-rw-r--r--  kernel/fs/xfs/xfs_attr_list.c | 2
-rw-r--r--  kernel/fs/xfs/xfs_bmap_util.c | 202
-rw-r--r--  kernel/fs/xfs/xfs_buf.c | 60
-rw-r--r--  kernel/fs/xfs/xfs_buf.h | 3
-rw-r--r--  kernel/fs/xfs/xfs_buf_item.c | 26
-rw-r--r--  kernel/fs/xfs/xfs_buf_item.h | 2
-rw-r--r--  kernel/fs/xfs/xfs_dir2_readdir.c | 13
-rw-r--r--  kernel/fs/xfs/xfs_dquot.c | 32
-rw-r--r--  kernel/fs/xfs/xfs_error.c | 4
-rw-r--r--  kernel/fs/xfs/xfs_error.h | 4
-rw-r--r--  kernel/fs/xfs/xfs_extfree_item.c | 107
-rw-r--r--  kernel/fs/xfs/xfs_extfree_item.h | 26
-rw-r--r--  kernel/fs/xfs/xfs_file.c | 356
-rw-r--r--  kernel/fs/xfs/xfs_filestream.c | 3
-rw-r--r--  kernel/fs/xfs/xfs_fsops.c | 16
-rw-r--r--  kernel/fs/xfs/xfs_icache.c | 18
-rw-r--r--  kernel/fs/xfs/xfs_inode.c | 289
-rw-r--r--  kernel/fs/xfs/xfs_inode.h | 2
-rw-r--r--  kernel/fs/xfs/xfs_inode_item.c | 12
-rw-r--r--  kernel/fs/xfs/xfs_inode_item.h | 1
-rw-r--r--  kernel/fs/xfs/xfs_ioctl.c | 37
-rw-r--r--  kernel/fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  kernel/fs/xfs/xfs_iomap.c | 88
-rw-r--r--  kernel/fs/xfs/xfs_iops.c | 71
-rw-r--r--  kernel/fs/xfs/xfs_itable.c | 16
-rw-r--r--  kernel/fs/xfs/xfs_linux.h | 21
-rw-r--r--  kernel/fs/xfs/xfs_log.c | 231
-rw-r--r--  kernel/fs/xfs/xfs_log.h | 15
-rw-r--r--  kernel/fs/xfs/xfs_log_cil.c | 20
-rw-r--r--  kernel/fs/xfs/xfs_log_priv.h | 55
-rw-r--r--  kernel/fs/xfs/xfs_log_recover.c | 321
-rw-r--r--  kernel/fs/xfs/xfs_message.c | 7
-rw-r--r--  kernel/fs/xfs/xfs_mount.c | 65
-rw-r--r--  kernel/fs/xfs/xfs_mount.h | 9
-rw-r--r--  kernel/fs/xfs/xfs_pnfs.c | 9
-rw-r--r--  kernel/fs/xfs/xfs_qm.c | 23
-rw-r--r--  kernel/fs/xfs/xfs_qm_syscalls.c | 20
-rw-r--r--  kernel/fs/xfs/xfs_quota.h | 1
-rw-r--r--  kernel/fs/xfs/xfs_rtalloc.c | 71
-rw-r--r--  kernel/fs/xfs/xfs_stats.c | 93
-rw-r--r--  kernel/fs/xfs/xfs_stats.h | 36
-rw-r--r--  kernel/fs/xfs/xfs_super.c | 98
-rw-r--r--  kernel/fs/xfs/xfs_symlink.c | 26
-rw-r--r--  kernel/fs/xfs/xfs_sysctl.c | 15
-rw-r--r--  kernel/fs/xfs/xfs_sysfs.c | 185
-rw-r--r--  kernel/fs/xfs/xfs_sysfs.h | 1
-rw-r--r--  kernel/fs/xfs/xfs_trace.h | 84
-rw-r--r--  kernel/fs/xfs/xfs_trans.c | 112
-rw-r--r--  kernel/fs/xfs/xfs_trans.h | 16
-rw-r--r--  kernel/fs/xfs/xfs_trans_ail.c | 18
-rw-r--r--  kernel/fs/xfs/xfs_trans_dquot.c | 32
-rw-r--r--  kernel/fs/xfs/xfs_trans_extfree.c | 32
-rw-r--r--  kernel/fs/xfs/xfs_trans_inode.c | 9
-rw-r--r--  kernel/fs/xfs/xfs_trans_priv.h | 17
-rw-r--r--  kernel/fs/xfs/xfs_xattr.c | 41
93 files changed, 3291 insertions, 1638 deletions
diff --git a/kernel/fs/xfs/Makefile b/kernel/fs/xfs/Makefile
index df6828570..f64639176 100644
--- a/kernel/fs/xfs/Makefile
+++ b/kernel/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_attr.o \
xfs_attr_leaf.o \
xfs_attr_remote.o \
+ xfs_bit.o \
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_btree.o \
@@ -63,7 +64,6 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs-y += xfs_aops.o \
xfs_attr_inactive.o \
xfs_attr_list.o \
- xfs_bit.o \
xfs_bmap_util.o \
xfs_buf.o \
xfs_dir2_readdir.o \
@@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
+ xfs_stats.o \
xfs_super.o \
xfs_symlink.o \
xfs_sysfs.o \
@@ -118,7 +119,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
-xfs-$(CONFIG_PROC_FS) += xfs_stats.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
diff --git a/kernel/fs/xfs/kmem.c b/kernel/fs/xfs/kmem.c
index a7a3a63bb..686ba6fb2 100644
--- a/kernel/fs/xfs/kmem.c
+++ b/kernel/fs/xfs/kmem.c
@@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
+ current->comm, current->pid,
+ (unsigned int)size, __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
@@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
+ __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
diff --git a/kernel/fs/xfs/libxfs/xfs_alloc.c b/kernel/fs/xfs/libxfs/xfs_alloc.c
index 516162be1..3479294c1 100644
--- a/kernel/fs/xfs/libxfs/xfs_alloc.c
+++ b/kernel/fs/xfs/libxfs/xfs_alloc.c
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
{
xfs_agblock_t bno;
xfs_extlen_t len;
+ xfs_extlen_t diff;
/* Trim busy sections out of found extent */
xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+ /*
+ * If we have a largish extent that happens to start before min_agbno,
+ * see if we can shift it into range...
+ */
+ if (bno < args->min_agbno && bno + len > args->min_agbno) {
+ diff = args->min_agbno - bno;
+ if (len > diff) {
+ bno += diff;
+ len -= diff;
+ }
+ }
+
if (args->alignment > 1 && len >= args->minlen) {
xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
- xfs_extlen_t diff = aligned_bno - bno;
+
+ diff = aligned_bno - bno;
*resbno = aligned_bno;
*reslen = diff >= len ? 0 : len - diff;
@@ -450,7 +464,7 @@ xfs_agfl_verify(
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
int i;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
return false;
@@ -468,7 +482,9 @@ xfs_agfl_verify(
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return false;
}
- return true;
+
+ return xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn));
}
static void
@@ -637,8 +653,8 @@ xfs_alloc_ag_vextent(
-((long)(args->len)));
}
- XFS_STATS_INC(xs_allocx);
- XFS_STATS_ADD(xs_allocb, args->len);
+ XFS_STATS_INC(args->mp, xs_allocx);
+ XFS_STATS_ADD(args->mp, xs_allocb, args->len);
return error;
}
@@ -795,9 +811,13 @@ xfs_alloc_find_best_extent(
* The good extent is closer than this one.
*/
if (!dir) {
+ if (*sbnoa > args->max_agbno)
+ goto out_use_good;
if (*sbnoa >= args->agbno + gdiff)
goto out_use_good;
} else {
+ if (*sbnoa < args->min_agbno)
+ goto out_use_good;
if (*sbnoa <= args->agbno - gdiff)
goto out_use_good;
}
@@ -884,6 +904,17 @@ xfs_alloc_ag_vextent_near(
dofirst = prandom_u32() & 1;
#endif
+ /* handle unitialized agbno range so caller doesn't have to */
+ if (!args->min_agbno && !args->max_agbno)
+ args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+ ASSERT(args->min_agbno <= args->max_agbno);
+
+ /* clamp agbno to the range if it's outside */
+ if (args->agbno < args->min_agbno)
+ args->agbno = args->min_agbno;
+ if (args->agbno > args->max_agbno)
+ args->agbno = args->max_agbno;
+
restart:
bno_cur_lt = NULL;
bno_cur_gt = NULL;
@@ -976,6 +1007,8 @@ restart:
&ltbnoa, &ltlena);
if (ltlena < args->minlen)
continue;
+ if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
+ continue;
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ASSERT(args->len >= args->minlen);
@@ -1096,11 +1129,11 @@ restart:
XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, ltbno, ltlen,
&ltbnoa, &ltlena);
- if (ltlena >= args->minlen)
+ if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
break;
if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
goto error0;
- if (!i) {
+ if (!i || ltbnoa < args->min_agbno) {
xfs_btree_del_cursor(bno_cur_lt,
XFS_BTREE_NOERROR);
bno_cur_lt = NULL;
@@ -1112,11 +1145,11 @@ restart:
XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, gtbno, gtlen,
&gtbnoa, &gtlena);
- if (gtlena >= args->minlen)
+ if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
break;
if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
goto error0;
- if (!i) {
+ if (!i || gtbnoa > args->max_agbno) {
xfs_btree_del_cursor(bno_cur_gt,
XFS_BTREE_NOERROR);
bno_cur_gt = NULL;
@@ -1216,6 +1249,7 @@ restart:
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
args->agbno = ltnew;
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
@@ -1776,8 +1810,8 @@ xfs_free_ag_extent(
if (!isfl)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
- XFS_STATS_INC(xs_freex);
- XFS_STATS_ADD(xs_freeb, len);
+ XFS_STATS_INC(mp, xs_freex);
+ XFS_STATS_ADD(mp, xs_freeb, len);
trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
@@ -1825,11 +1859,11 @@ xfs_alloc_compute_maxlevels(
xfs_extlen_t
xfs_alloc_longest_free_extent(
struct xfs_mount *mp,
- struct xfs_perag *pag)
+ struct xfs_perag *pag,
+ xfs_extlen_t need)
{
- xfs_extlen_t need, delta = 0;
+ xfs_extlen_t delta = 0;
- need = XFS_MIN_FREELIST_PAG(pag, mp);
if (need > pag->pagf_flcount)
delta = need - pag->pagf_flcount;
@@ -1838,131 +1872,150 @@ xfs_alloc_longest_free_extent(
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
+unsigned int
+xfs_alloc_min_freelist(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ unsigned int min_free;
+
+ /* space needed by-bno freespace btree */
+ min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+ mp->m_ag_maxlevels);
+ /* space needed by-size freespace btree */
+ min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+ mp->m_ag_maxlevels);
+
+ return min_free;
+}
+
+/*
+ * Check if the operation we are fixing up the freelist for should go ahead or
+ * not. If we are freeing blocks, we always allow it, otherwise the allocation
+ * is dependent on whether the size and shape of free space available will
+ * permit the requested allocation to take place.
+ */
+static bool
+xfs_alloc_space_available(
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t min_free,
+ int flags)
+{
+ struct xfs_perag *pag = args->pag;
+ xfs_extlen_t longest;
+ int available;
+
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ return true;
+
+ /* do we have enough contiguous free space for the allocation? */
+ longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+ if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+ return false;
+
+ /* do have enough free space remaining for the allocation? */
+ available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+ min_free - args->total);
+ if (available < (int)args->minleft)
+ return false;
+
+ return true;
+}
+
/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
STATIC int /* error */
xfs_alloc_fix_freelist(
- xfs_alloc_arg_t *args, /* allocation argument structure */
- int flags) /* XFS_ALLOC_FLAG_... */
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ int flags) /* XFS_ALLOC_FLAG_... */
{
- xfs_buf_t *agbp; /* agf buffer pointer */
- xfs_agf_t *agf; /* a.g. freespace structure pointer */
- xfs_buf_t *agflbp;/* agfl buffer pointer */
- xfs_agblock_t bno; /* freelist block */
- xfs_extlen_t delta; /* new blocks needed in freelist */
- int error; /* error result code */
- xfs_extlen_t longest;/* longest extent in allocation group */
- xfs_mount_t *mp; /* file system mount point structure */
- xfs_extlen_t need; /* total blocks needed in freelist */
- xfs_perag_t *pag; /* per-ag information structure */
- xfs_alloc_arg_t targs; /* local allocation arguments */
- xfs_trans_t *tp; /* transaction pointer */
-
- mp = args->mp;
+ struct xfs_mount *mp = args->mp;
+ struct xfs_perag *pag = args->pag;
+ struct xfs_trans *tp = args->tp;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_buf *agflbp = NULL;
+ struct xfs_alloc_arg targs; /* local allocation arguments */
+ xfs_agblock_t bno; /* freelist block */
+ xfs_extlen_t need; /* total blocks needed in freelist */
+ int error = 0;
- pag = args->pag;
- tp = args->tp;
if (!pag->pagf_init) {
- if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
- &agbp)))
- return error;
+ error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ if (error)
+ goto out_no_agbp;
if (!pag->pagf_init) {
ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
+ goto out_agbp_relse;
}
- } else
- agbp = NULL;
+ }
/*
- * If this is a metadata preferred pag and we are user data
- * then try somewhere else if we are not being asked to
- * try harder at this point
+ * If this is a metadata preferred pag and we are user data then try
+ * somewhere else if we are not being asked to try harder at this
+ * point
*/
if (pag->pagf_metadata && args->userdata &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
+ goto out_agbp_relse;
}
- if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
- /*
- * If it looks like there isn't a long enough extent, or enough
- * total blocks, reject it.
- */
- need = XFS_MIN_FREELIST_PAG(pag, mp);
- longest = xfs_alloc_longest_free_extent(mp, pag);
- if ((args->minlen + args->alignment + args->minalignslop - 1) >
- longest ||
- ((int)(pag->pagf_freeblks + pag->pagf_flcount -
- need - args->total) < (int)args->minleft)) {
- if (agbp)
- xfs_trans_brelse(tp, agbp);
- args->agbp = NULL;
- return 0;
- }
- }
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags))
+ goto out_agbp_relse;
/*
* Get the a.g. freespace buffer.
* Can fail if we're not blocking on locks, and it's held.
*/
- if (agbp == NULL) {
- if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
- &agbp)))
- return error;
- if (agbp == NULL) {
+ if (!agbp) {
+ error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ if (error)
+ goto out_no_agbp;
+ if (!agbp) {
ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
- }
- }
- /*
- * Figure out how many blocks we should have in the freelist.
- */
- agf = XFS_BUF_TO_AGF(agbp);
- need = XFS_MIN_FREELIST(agf, mp);
- /*
- * If there isn't enough total or single-extent, reject it.
- */
- if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
- delta = need > be32_to_cpu(agf->agf_flcount) ?
- (need - be32_to_cpu(agf->agf_flcount)) : 0;
- longest = be32_to_cpu(agf->agf_longest);
- longest = (longest > delta) ? (longest - delta) :
- (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
- if ((args->minlen + args->alignment + args->minalignslop - 1) >
- longest ||
- ((int)(be32_to_cpu(agf->agf_freeblks) +
- be32_to_cpu(agf->agf_flcount) - need - args->total) <
- (int)args->minleft)) {
- xfs_trans_brelse(tp, agbp);
- args->agbp = NULL;
- return 0;
+ goto out_no_agbp;
}
}
+
+ /* If there isn't enough total space or single-extent, reject it. */
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags))
+ goto out_agbp_relse;
+
/*
* Make the freelist shorter if it's too long.
+ *
+ * Note that from this point onwards, we will always release the agf and
+ * agfl buffers on error. This handles the case where we error out and
+ * the buffers are clean or may not have been joined to the transaction
+ * and hence need to be released manually. If they have been joined to
+ * the transaction, then xfs_trans_brelse() will handle them
+ * appropriately based on the recursion count and dirty state of the
+ * buffer.
+ *
+ * XXX (dgc): When we have lots of free space, does this buy us
+ * anything other than extra overhead when we need to put more blocks
+ * back on the free list? Maybe we should only do this when space is
+ * getting low or the AGFL is more than half full?
*/
- while (be32_to_cpu(agf->agf_flcount) > need) {
- xfs_buf_t *bp;
+ while (pag->pagf_flcount > need) {
+ struct xfs_buf *bp;
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
- return error;
- if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
- return error;
+ goto out_agbp_relse;
+ error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+ if (error)
+ goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
xfs_trans_binval(tp, bp);
}
- /*
- * Initialize the args structure.
- */
+
memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
@@ -1971,21 +2024,20 @@ xfs_alloc_fix_freelist(
targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
- if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
- return error;
- /*
- * Make the freelist longer if it's too short.
- */
- while (be32_to_cpu(agf->agf_flcount) < need) {
+ error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+ if (error)
+ goto out_agbp_relse;
+
+ /* Make the freelist longer if it's too short. */
+ while (pag->pagf_flcount < need) {
targs.agbno = 0;
- targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
- /*
- * Allocate as many blocks as possible at once.
- */
- if ((error = xfs_alloc_ag_vextent(&targs))) {
- xfs_trans_brelse(tp, agflbp);
- return error;
- }
+ targs.maxlen = need - pag->pagf_flcount;
+
+ /* Allocate as many blocks as possible at once. */
+ error = xfs_alloc_ag_vextent(&targs);
+ if (error)
+ goto out_agflbp_relse;
+
/*
* Stop if we run out. Won't happen if callers are obeying
* the restrictions correctly. Can happen for free calls
@@ -1994,9 +2046,7 @@ xfs_alloc_fix_freelist(
if (targs.agbno == NULLAGBLOCK) {
if (flags & XFS_ALLOC_FLAG_FREEING)
break;
- xfs_trans_brelse(tp, agflbp);
- args->agbp = NULL;
- return 0;
+ goto out_agflbp_relse;
}
/*
* Put each allocated block on the list.
@@ -2005,12 +2055,21 @@ xfs_alloc_fix_freelist(
error = xfs_alloc_put_freelist(tp, agbp,
agflbp, bno, 0);
if (error)
- return error;
+ goto out_agflbp_relse;
}
}
xfs_trans_brelse(tp, agflbp);
args->agbp = agbp;
return 0;
+
+out_agflbp_relse:
+ xfs_trans_brelse(tp, agflbp);
+out_agbp_relse:
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+out_no_agbp:
+ args->agbp = NULL;
+ return error;
}
/*
@@ -2202,9 +2261,13 @@ xfs_agf_verify(
{
struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
return false;
+ }
if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
@@ -2446,7 +2509,7 @@ xfs_alloc_vextent(
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
- if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+ if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
(mp->m_flags & XFS_MOUNT_32BITINODES)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
@@ -2577,6 +2640,14 @@ xfs_alloc_vextent(
XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
args->len);
#endif
+
+ /* Zero the extent if we were asked to do so */
+ if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(args->ip, args->fsbno, args->len);
+ if (error)
+ goto error0;
+ }
+
}
xfs_perag_put(args->pag);
return 0;
diff --git a/kernel/fs/xfs/libxfs/xfs_alloc.h b/kernel/fs/xfs/libxfs/xfs_alloc.h
index d1b4b6a5c..0ecde4d5c 100644
--- a/kernel/fs/xfs/libxfs/xfs_alloc.h
+++ b/kernel/fs/xfs/libxfs/xfs_alloc.h
@@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg {
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for a.g. freelist header */
struct xfs_perag *pag; /* per-ag struct for this agno */
+ struct xfs_inode *ip; /* for userdata zeroing method */
xfs_fsblock_t fsbno; /* file system block number */
xfs_agnumber_t agno; /* allocation group number */
xfs_agblock_t agbno; /* allocation group-relative block # */
@@ -112,27 +113,28 @@ typedef struct xfs_alloc_arg {
xfs_extlen_t total; /* total blocks needed in xaction */
xfs_extlen_t alignment; /* align answer to multiple of this */
xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
+ xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */
+ xfs_agblock_t max_agbno; /* ... */
xfs_extlen_t len; /* output: actual size of extent */
xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
xfs_alloctype_t otype; /* original allocation type */
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
char isfl; /* set if is freelist blocks - !acctg */
- char userdata; /* set if this is user data */
+ char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
} xfs_alloc_arg_t;
/*
* Defines for userdata
*/
-#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
+#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+ struct xfs_perag *pag, xfs_extlen_t need);
+unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
/*
diff --git a/kernel/fs/xfs/libxfs/xfs_alloc_btree.c b/kernel/fs/xfs/libxfs/xfs_alloc_btree.c
index 59d521c09..90de071dd 100644
--- a/kernel/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/kernel/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -295,7 +295,7 @@ xfs_allocbt_verify(
case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
return false;
@@ -313,7 +313,7 @@ xfs_allocbt_verify(
case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
return false;
diff --git a/kernel/fs/xfs/libxfs/xfs_attr.c b/kernel/fs/xfs/libxfs/xfs_attr.c
index 0a472fbe0..f949818fa 100644
--- a/kernel/fs/xfs/libxfs/xfs_attr.c
+++ b/kernel/fs/xfs/libxfs/xfs_attr.c
@@ -125,7 +125,7 @@ xfs_attr_get(
uint lock_mode;
int error;
- XFS_STATS_INC(xs_attr_get);
+ XFS_STATS_INC(ip->i_mount, xs_attr_get);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
@@ -139,6 +139,8 @@ xfs_attr_get(
args.value = value;
args.valuelen = *valuelenp;
+ /* Entirely possible to look up a name which doesn't exist */
+ args.op_flags = XFS_DA_OP_OKNOENT;
lock_mode = xfs_ilock_attr_map_shared(ip);
if (!xfs_inode_hasattr(ip))
@@ -207,7 +209,7 @@ xfs_attr_set(
int rsvd = (flags & ATTR_ROOT) != 0;
int error, err2, committed, local;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(mp, xs_attr_set);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -266,7 +268,7 @@ xfs_attr_set(
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
if (error) {
- xfs_trans_cancel(args.trans, 0);
+ xfs_trans_cancel(args.trans);
return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -276,7 +278,7 @@ xfs_attr_set(
XFS_QMOPT_RES_REGBLKS);
if (error) {
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_trans_cancel(args.trans);
return error;
}
@@ -320,8 +322,7 @@ xfs_attr_set(
xfs_trans_ichgtime(args.trans, dp,
XFS_ICHGTIME_CHG);
}
- err2 = xfs_trans_commit(args.trans,
- XFS_TRANS_RELEASE_LOG_RES);
+ err2 = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error ? error : err2;
@@ -383,16 +384,14 @@ xfs_attr_set(
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
out:
- if (args.trans) {
- xfs_trans_cancel(args.trans,
- XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- }
+ if (args.trans)
+ xfs_trans_cancel(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
@@ -413,7 +412,7 @@ xfs_attr_remove(
xfs_fsblock_t firstblock;
int error;
- XFS_STATS_INC(xs_attr_remove);
+ XFS_STATS_INC(mp, xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -462,7 +461,7 @@ xfs_attr_remove(
error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
XFS_ATTRRM_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(args.trans, 0);
+ xfs_trans_cancel(args.trans);
return error;
}
@@ -501,16 +500,14 @@ xfs_attr_remove(
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
out:
- if (args.trans) {
- xfs_trans_cancel(args.trans,
- XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- }
+ if (args.trans)
+ xfs_trans_cancel(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
diff --git a/kernel/fs/xfs/libxfs/xfs_attr_leaf.c b/kernel/fs/xfs/libxfs/xfs_attr_leaf.c
index e9d401ce9..aa187f7ba 100644
--- a/kernel/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/kernel/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -41,6 +41,7 @@
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
#include "xfs_dir2.h"
+#include "xfs_log.h"
/*
@@ -262,10 +263,12 @@ xfs_attr3_leaf_verify(
if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
return false;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
return false;
@@ -1056,7 +1059,7 @@ xfs_attr3_leaf_create(
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
} else {
diff --git a/kernel/fs/xfs/libxfs/xfs_attr_remote.c b/kernel/fs/xfs/libxfs/xfs_attr_remote.c
index dd714037c..5ab95ffa4 100644
--- a/kernel/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/kernel/fs/xfs/libxfs/xfs_attr_remote.c
@@ -100,14 +100,14 @@ xfs_attr3_rmt_verify(
return false;
if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
return false;
- if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(rmt->rm_blkno) != bno)
return false;
if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
return false;
if (be32_to_cpu(rmt->rm_offset) +
- be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+ be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
return false;
if (rmt->rm_owner == 0)
return false;
@@ -222,7 +222,7 @@ xfs_attr3_rmt_hdr_set(
rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
rmt->rm_offset = cpu_to_be32(offset);
rmt->rm_bytes = cpu_to_be32(size);
- uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid);
rmt->rm_owner = cpu_to_be64(ino);
rmt->rm_blkno = cpu_to_be64(bno);
@@ -618,9 +618,8 @@ xfs_attr_rmtval_remove(
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- 1, args->firstblock, args->flist,
- &done);
+ XFS_BMAPI_ATTRFORK, 1, args->firstblock,
+ args->flist, &done);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
diff --git a/kernel/fs/xfs/xfs_bit.c b/kernel/fs/xfs/libxfs/xfs_bit.c
index 0e8885a59..0e8885a59 100644
--- a/kernel/fs/xfs/xfs_bit.c
+++ b/kernel/fs/xfs/libxfs/xfs_bit.c
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap.c b/kernel/fs/xfs/libxfs/xfs_bmap.c
index f1026e86d..119c2422a 100644
--- a/kernel/fs/xfs/libxfs/xfs_bmap.c
+++ b/kernel/fs/xfs/libxfs/xfs_bmap.c
@@ -948,14 +948,16 @@ xfs_bmap_local_to_extents(
bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
/*
- * Initialise the block and copy the data
+ * Initialize the block, copy the data and log the remote buffer.
*
- * Note: init_fn must set the buffer log item type correctly!
+ * The callout is responsible for logging because the remote format
+ * might differ from the local format and thus we don't know how much to
+ * log here. Note that init_fn must also set the buffer log item type
+ * correctly.
*/
init_fn(tp, bp, ip, ifp);
- /* account for the change in fork size and log everything */
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ /* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
@@ -1112,7 +1114,6 @@ xfs_bmap_add_attrfork(
int committed; /* xaction was committed */
int logflags; /* logging flags */
int error; /* error return value */
- int cancel_flags = 0;
ASSERT(XFS_IFORK_Q(ip) == 0);
@@ -1124,17 +1125,15 @@ xfs_bmap_add_attrfork(
tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
if (error)
goto trans_cancel;
- cancel_flags |= XFS_TRANS_ABORT;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
@@ -1218,14 +1217,14 @@ xfs_bmap_add_attrfork(
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
goto bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
bmap_cancel:
xfs_bmap_cancel(&flist);
trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1438,7 +1437,7 @@ xfs_bmap_search_extents(
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- XFS_STATS_INC(xs_look_exlist);
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
ifp = XFS_IFORK_PTR(ip, fork);
ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
@@ -1735,7 +1734,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -2289,7 +2288,7 @@ xfs_bmap_add_extent_unwritten_real(
ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
ASSERT(!isnullstartblock(new->br_startblock));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -2949,7 +2948,7 @@ xfs_bmap_add_extent_hole_real(
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
state = 0;
if (whichfork == XFS_ATTR_FORK)
@@ -3521,7 +3520,8 @@ xfs_bmap_longest_free_extent(
}
}
- longest = xfs_alloc_longest_free_extent(mp, pag);
+ longest = xfs_alloc_longest_free_extent(mp, pag,
+ xfs_alloc_min_freelist(mp, pag));
if (*blen < longest)
*blen = longest;
@@ -3802,8 +3802,13 @@ xfs_bmap_btalloc(
args.wasdel = ap->wasdel;
args.isfl = 0;
args.userdata = ap->userdata;
- if ((error = xfs_alloc_vextent(&args)))
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+ args.ip = ap->ip;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
return error;
+
if (tryagain && args.fsbno == NULLFSBLOCK) {
/*
* Exact allocation failed. Now try with alignment
@@ -4038,7 +4043,7 @@ xfs_bmapi_read(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapr);
+ XFS_STATS_INC(mp, xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -4223,7 +4228,7 @@ xfs_bmapi_delay(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
@@ -4302,11 +4307,14 @@ xfs_bmapi_allocate(
/*
* Indicate if this is the first user data in the file, or just any
- * user data.
+ * user data. And if it is userdata, indicate whether it needs to
+ * be initialised to zero during allocation.
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->userdata = (bma->offset == 0) ?
XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ if (bma->flags & XFS_BMAPI_ZERO)
+ bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
}
bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4421,10 +4429,29 @@ xfs_bmapi_convert_unwritten(
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+ /*
+ * Before insertion into the bmbt, zero the range being converted
+ * if required.
+ */
+ if (flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, mval->br_startblock,
+ mval->br_blockcount);
+ if (error)
+ return error;
+ }
+
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
&bma->cur, mval, bma->firstblock, bma->flist,
&tmp_logflags);
- bma->logflags |= tmp_logflags;
+ /*
+ * Log the inode core unconditionally in the unwritten extent conversion
+ * path because the conversion might not have done so (e.g., if the
+ * extent count hasn't changed). We need to make sure the inode is dirty
+ * in the transaction for the sake of fsync(), even if nothing has
+ * changed, because fsync() will not force the log for this transaction
+ * unless it sees the inode pinned.
+ */
+ bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
if (error)
return error;
@@ -4506,6 +4533,18 @@ xfs_bmapi_write(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /* zeroing is for currently only for data extents, not metadata */
+ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+ /*
+ * we can allocate unwritten extents or pre-zero allocated blocks,
+ * but it makes no sense to do both at once. This would result in
+ * zeroing the unwritten extent twice, but it still being an
+ * unwritten extent....
+ */
+ ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
@@ -4519,7 +4558,7 @@ xfs_bmapi_write(
ifp = XFS_IFORK_PTR(ip, whichfork);
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
@@ -4712,12 +4751,12 @@ xfs_bmap_del_extent(
xfs_filblks_t temp2; /* for indirect length calculations */
int state = 0;
- XFS_STATS_INC(xs_del_exlist);
+ mp = ip->i_mount;
+ XFS_STATS_INC(mp, xs_del_exlist);
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
- mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)));
@@ -5064,7 +5103,7 @@ xfs_bunmapi(
*done = 1;
return 0;
}
- XFS_STATS_INC(xs_blk_unmap);
+ XFS_STATS_INC(mp, xs_blk_unmap);
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
start = bno;
bno = start + len - 1;
@@ -5918,7 +5957,7 @@ xfs_bmap_split_extent(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -5936,10 +5975,10 @@ xfs_bmap_split_extent(
if (error)
goto out;
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
+ return xfs_trans_commit(tp);
out:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp);
return error;
}
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap.h b/kernel/fs/xfs/libxfs/xfs_bmap.h
index 6aaa0c1c7..a160f8a5a 100644
--- a/kernel/fs/xfs/libxfs/xfs_bmap.h
+++ b/kernel/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_bmalloca {
xfs_extlen_t minleft; /* amount must be left after alloc */
bool eof; /* set if allocating past last extent */
bool wasdel; /* replacing a delayed allocation */
- bool userdata;/* set if is user data */
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
+ char userdata;/* userdata mask */
int flags;
};
@@ -109,6 +109,14 @@ typedef struct xfs_bmap_free
*/
#define XFS_BMAPI_CONVERT 0x040
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * during the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO 0x080
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -116,7 +124,8 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_ZERO, "ZERO" }
static inline int xfs_bmapi_aflag(int w)
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap_btree.c b/kernel/fs/xfs/libxfs/xfs_bmap_btree.c
index 2c44c8e50..6b0cf6546 100644
--- a/kernel/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/kernel/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -349,7 +349,8 @@ xfs_bmbt_to_bmdr(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
- ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid,
+ &mp->m_sb.sb_meta_uuid));
ASSERT(rblock->bb_u.l.bb_blkno ==
cpu_to_be64(XFS_BUF_DADDR_NULL));
} else
@@ -647,7 +648,7 @@ xfs_bmbt_verify(
case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
return false;
diff --git a/kernel/fs/xfs/libxfs/xfs_btree.c b/kernel/fs/xfs/libxfs/xfs_btree.c
index c72283dd8..af1bbee55 100644
--- a/kernel/fs/xfs/libxfs/xfs_btree.c
+++ b/kernel/fs/xfs/libxfs/xfs_btree.c
@@ -32,6 +32,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_alloc.h"
+#include "xfs_log.h"
/*
* Cursor allocation zone.
@@ -65,7 +66,8 @@ xfs_btree_check_lblock(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
lblock_ok = lblock_ok &&
- uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+ uuid_equal(&block->bb_u.l.bb_uuid,
+ &mp->m_sb.sb_meta_uuid) &&
block->bb_u.l.bb_blkno == cpu_to_be64(
bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
}
@@ -115,7 +117,8 @@ xfs_btree_check_sblock(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
sblock_ok = sblock_ok &&
- uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+ uuid_equal(&block->bb_u.s.bb_uuid,
+ &mp->m_sb.sb_meta_uuid) &&
block->bb_u.s.bb_blkno == cpu_to_be64(
bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
}
@@ -220,7 +223,7 @@ xfs_btree_check_ptr(
* long-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -241,8 +244,14 @@ bool
xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+ }
return true;
}
@@ -252,7 +261,7 @@ xfs_btree_lblock_verify_crc(
* short-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -273,8 +282,14 @@ bool
xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+ }
return true;
}
@@ -1000,7 +1015,7 @@ xfs_btree_init_block_int(
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.l.bb_owner = cpu_to_be64(owner);
- uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.l.bb_pad = 0;
buf->bb_u.l.bb_lsn = 0;
}
@@ -1013,7 +1028,7 @@ xfs_btree_init_block_int(
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
- uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.s.bb_lsn = 0;
}
}
diff --git a/kernel/fs/xfs/libxfs/xfs_btree.h b/kernel/fs/xfs/libxfs/xfs_btree.h
index 8f18bab73..992dec063 100644
--- a/kernel/fs/xfs/libxfs/xfs_btree.h
+++ b/kernel/fs/xfs/libxfs/xfs_btree.h
@@ -84,31 +84,38 @@ union xfs_btree_rec {
/*
* Generic stats interface
*/
-#define __XFS_BTREE_STATS_INC(type, stat) \
- XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
-#define XFS_BTREE_STATS_INC(cur, stat) \
+#define __XFS_BTREE_STATS_INC(mp, type, stat) \
+ XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
-#define __XFS_BTREE_STATS_ADD(type, stat, val) \
- XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
+ XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ case XFS_BTNUM_BNO: \
+ __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
+ case XFS_BTNUM_CNT: \
+ __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
+ case XFS_BTNUM_BMAP: \
+ __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
+ case XFS_BTNUM_INO: \
+ __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: \
+ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
diff --git a/kernel/fs/xfs/libxfs/xfs_da_btree.c b/kernel/fs/xfs/libxfs/xfs_da_btree.c
index 2385f8cd0..e89a0f8f8 100644
--- a/kernel/fs/xfs/libxfs/xfs_da_btree.c
+++ b/kernel/fs/xfs/libxfs/xfs_da_btree.c
@@ -39,6 +39,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
* xfs_da_btree.c
@@ -146,10 +147,12 @@ xfs_da3_node_verify(
if (ichdr.magic != XFS_DA3_NODE_MAGIC)
return false;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_DA_NODE_MAGIC)
return false;
@@ -233,6 +236,7 @@ xfs_da3_node_read_verify(
bp->b_ops->verify_read(bp);
return;
default:
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
break;
}
@@ -321,10 +325,11 @@ xfs_da3_node_create(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
hdr3->info.blkno = cpu_to_be64(bp->b_bn);
hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
- uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
ichdr.magic = XFS_DA_NODE_MAGIC;
}
@@ -1822,6 +1827,7 @@ xfs_da3_path_shift(
struct xfs_da_args *args;
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_buf *bp;
xfs_dablk_t blkno = 0;
int level;
int error;
@@ -1866,20 +1872,24 @@ xfs_da3_path_shift(
*/
for (blk++, level++; level < path->active; blk++, level++) {
/*
- * Release the old block.
- * (if it's dirty, trans won't actually let go)
+ * Read the next child block into a local buffer.
*/
- if (release)
- xfs_trans_brelse(args->trans, blk->bp);
+ error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp,
+ args->whichfork);
+ if (error)
+ return error;
/*
- * Read the next child block.
+ * Release the old block (if it's dirty, the trans doesn't
+ * actually let go) and swap the local buffer into the path
+ * structure. This ensures failure of the above read doesn't set
+ * a NULL buffer in an active slot in the path.
*/
+ if (release)
+ xfs_trans_brelse(args->trans, blk->bp);
blk->blkno = blkno;
- error = xfs_da3_node_read(args->trans, dp, blkno, -1,
- &blk->bp, args->whichfork);
- if (error)
- return error;
+ blk->bp = bp;
+
info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
@@ -2351,8 +2361,8 @@ xfs_da_shrink_inode(
* the last block to the place we want to kill.
*/
error = xfs_bunmapi(tp, dp, dead_blkno, count,
- xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
- 0, args->firstblock, args->flist, &done);
+ xfs_bmapi_aflag(w), 0, args->firstblock,
+ args->flist, &done);
if (error == -ENOSPC) {
if (w != XFS_DATA_FORK)
break;
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2.c b/kernel/fs/xfs/libxfs/xfs_dir2.c
index a69fb3a1e..2fb53a5c0 100644
--- a/kernel/fs/xfs/libxfs/xfs_dir2.c
+++ b/kernel/fs/xfs/libxfs/xfs_dir2.c
@@ -271,7 +271,7 @@ xfs_dir_createname(
rval = xfs_dir_ino_validate(tp->t_mountp, inum);
if (rval)
return rval;
- XFS_STATS_INC(xs_dir_create);
+ XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -362,9 +362,10 @@ xfs_dir_lookup(
struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
+ int lock_mode;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_lookup);
+ XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
/*
* We need to use KM_NOFS here so that lockdep will not throw false
@@ -387,6 +388,7 @@ xfs_dir_lookup(
if (ci_name)
args->op_flags |= XFS_DA_OP_CILOOKUP;
+ lock_mode = xfs_ilock_data_map_shared(dp);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_lookup(args);
goto out_check_rval;
@@ -419,6 +421,7 @@ out_check_rval:
}
}
out_free:
+ xfs_iunlock(dp, lock_mode);
kmem_free(args);
return rval;
}
@@ -441,7 +444,7 @@ xfs_dir_removename(
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_remove);
+ XFS_STATS_INC(dp->i_mount, xs_dir_remove);
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
@@ -674,25 +677,22 @@ xfs_dir2_shrink_inode(
mp = dp->i_mount;
tp = args->trans;
da = xfs_dir2_db_to_da(args->geo, db);
- /*
- * Unmap the fsblock(s).
- */
- if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
- XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
- &done))) {
+
+ /* Unmap the fsblock(s). */
+ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0,
+ args->firstblock, args->flist, &done);
+ if (error) {
/*
- * ENOSPC actually can happen if we're in a removename with
- * no space reservation, and the resulting block removal
- * would cause a bmap btree split or conversion from extents
- * to btree. This can only happen for un-fragmented
- * directory blocks, since you need to be punching out
- * the middle of an extent.
- * In this case we need to leave the block in the file,
- * and not binval it.
- * So the block has to be in a consistent empty state
- * and appropriately logged.
- * We don't free up the buffer, the caller can tell it
- * hasn't happened since it got an error back.
+ * ENOSPC actually can happen if we're in a removename with no
+ * space reservation, and the resulting block removal would
+ * cause a bmap btree split or conversion from extents to btree.
+ * This can only happen for un-fragmented directory blocks,
+ * since you need to be punching out the middle of an extent.
+ * In this case we need to leave the block in the file, and not
+ * binval it. So the block has to be in a consistent empty
+ * state and appropriately logged. We don't free up the buffer,
+ * the caller can tell it hasn't happened since it got an error
+ * back.
*/
return error;
}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_block.c b/kernel/fs/xfs/libxfs/xfs_dir2_block.c
index 9354e190b..9c10e2b8c 100644
--- a/kernel/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_block.c
@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function prototypes.
@@ -67,10 +68,12 @@ xfs_dir3_block_verify(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
return false;
@@ -157,7 +160,7 @@ xfs_dir3_block_init(
hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
return;
}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_data.c b/kernel/fs/xfs/libxfs/xfs_dir2_data.c
index 534bbf283..af71a84f3 100644
--- a/kernel/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_data.c
@@ -31,6 +31,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Check the consistency of the data block.
@@ -220,10 +221,12 @@ xfs_dir3_data_verify(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
return false;
@@ -605,7 +608,7 @@ xfs_dir3_data_init(
hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c b/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c
index 106119955..3923e1f94 100644
--- a/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function declarations.
@@ -160,10 +161,12 @@ xfs_dir3_leaf_verify(
if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
return false;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
+ return false;
} else {
if (leaf->hdr.info.magic != cpu_to_be16(magic))
return false;
@@ -310,7 +313,7 @@ xfs_dir3_leaf_init(
: cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
leaf3->info.blkno = cpu_to_be64(bp->b_bn);
leaf3->info.owner = cpu_to_be64(owner);
- uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
memset(leaf, 0, sizeof(*leaf));
leaf->hdr.info.magic = cpu_to_be16(type);
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_node.c b/kernel/fs/xfs/libxfs/xfs_dir2_node.c
index 06bb4218b..70b0cb2fd 100644
--- a/kernel/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_node.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Function declarations.
@@ -93,10 +94,12 @@ xfs_dir3_free_verify(
if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
return false;
@@ -226,7 +229,7 @@ xfs_dir3_free_get_buf(
hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr.magic = XFS_DIR2_FREE_MAGIC;
dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
@@ -1845,8 +1848,7 @@ xfs_dir2_node_addname_int(
if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
xfs_alert(mp,
- "%s: dir ino %llu needed freesp block %lld for\n"
- " data block %lld, got %lld ifbno %llu lastfbno %d",
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d",
__func__, (unsigned long long)dp->i_ino,
(long long)dp->d_ops->db_to_fdb(
args->geo, dbno),
diff --git a/kernel/fs/xfs/libxfs/xfs_dquot_buf.c b/kernel/fs/xfs/libxfs/xfs_dquot_buf.c
index 6fbf2d853..5331b7f04 100644
--- a/kernel/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/kernel/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -163,7 +163,7 @@ xfs_dqcheck(
d->dd_diskdq.d_id = cpu_to_be32(id);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
@@ -198,7 +198,7 @@ xfs_dquot_buf_verify_crc(
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF))
return false;
- if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid))
return false;
}
return true;
diff --git a/kernel/fs/xfs/libxfs/xfs_format.h b/kernel/fs/xfs/libxfs/xfs_format.h
index 4daaa6623..e2536bb1c 100644
--- a/kernel/fs/xfs/libxfs/xfs_format.h
+++ b/kernel/fs/xfs/libxfs/xfs_format.h
@@ -60,6 +60,14 @@ struct xfs_ifork;
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
/*
+ * The size of a single extended attribute on disk is limited by
+ * the size of index values within the attribute entries themselves.
+ * These are be16 fields, so we can only support attribute data
+ * sizes up to 2^16 bytes in length.
+ */
+#define XFS_XATTR_SIZE_MAX (1 << 16)
+
+/*
* Supported feature bit list is just all bits in the versionnum field because
* we've used them all up and understand them all. Except, of course, for the
* shared superblock bit, which nobody knows what it does and so is unsupported.
@@ -100,7 +108,7 @@ typedef struct xfs_sb {
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
xfs_rtblock_t sb_rextents; /* number of realtime extents */
- uuid_t sb_uuid; /* file system unique id */
+ uuid_t sb_uuid; /* user-visible file system unique id */
xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
@@ -170,10 +178,11 @@ typedef struct xfs_sb {
__uint32_t sb_features_log_incompat;
__uint32_t sb_crc; /* superblock crc */
- __uint32_t sb_pad;
+ xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */
xfs_ino_t sb_pquotino; /* project quota inode */
xfs_lsn_t sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -190,7 +199,7 @@ typedef struct xfs_dsb {
__be64 sb_dblocks; /* number of data blocks */
__be64 sb_rblocks; /* number of realtime blocks */
__be64 sb_rextents; /* number of realtime extents */
- uuid_t sb_uuid; /* file system unique id */
+ uuid_t sb_uuid; /* user-visible file system unique id */
__be64 sb_logstart; /* starting block of log if internal */
__be64 sb_rootino; /* root inode number */
__be64 sb_rbmino; /* bitmap inode for realtime extents */
@@ -256,10 +265,11 @@ typedef struct xfs_dsb {
__be32 sb_features_log_incompat;
__le32 sb_crc; /* superblock crc */
- __be32 sb_pad;
+ __be32 sb_spino_align; /* sparse inode chunk alignment */
__be64 sb_pquotino; /* project quota inode */
__be64 sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
} xfs_dsb_t;
@@ -457,8 +467,12 @@ xfs_sb_has_ro_compat_feature(
}
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
+#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
#define XFS_SB_FEAT_INCOMPAT_ALL \
- (XFS_SB_FEAT_INCOMPAT_FTYPE)
+ (XFS_SB_FEAT_INCOMPAT_FTYPE| \
+ XFS_SB_FEAT_INCOMPAT_SPINODES| \
+ XFS_SB_FEAT_INCOMPAT_META_UUID)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -506,6 +520,24 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
}
+static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
+}
+
+/*
+ * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID
+ * is stored separately from the user-visible UUID; this allows the
+ * user-visible UUID to be changed on V5 filesystems which have a
+ * filesystem UUID stamped into every piece of metadata.
+ */
+static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
+}
+
/*
* end of superblock version macros
*/
@@ -754,23 +786,10 @@ typedef struct xfs_agfl {
__be64 agfl_lsn;
__be32 agfl_crc;
__be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
+} __attribute__((packed)) xfs_agfl_t;
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
-
-#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
-#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
- (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
-#define XFS_MIN_FREELIST(a,mp) \
- (XFS_MIN_FREELIST_RAW( \
- be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
- be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
-#define XFS_MIN_FREELIST_PAG(pag,mp) \
- (XFS_MIN_FREELIST_RAW( \
- (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
- (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
-
#define XFS_AGB_TO_FSB(mp,agno,agbno) \
(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
#define XFS_FSB_TO_AGNO(mp,fsbno) \
@@ -1216,26 +1235,54 @@ typedef __uint64_t xfs_inofree_t;
#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT \
+ (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
+
static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
{
return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
}
/*
- * Data record structure
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes with the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
*/
typedef struct xfs_inobt_rec {
__be32 ir_startino; /* starting inode number */
- __be32 ir_freecount; /* count of free inodes (set bits) */
+ union {
+ struct {
+ __be32 ir_freecount; /* count of free inodes */
+ } f;
+ struct {
+ __be16 ir_holemask;/* hole mask for sparse chunks */
+ __u8 ir_count; /* total inode count */
+ __u8 ir_freecount; /* count of free inodes */
+ } sp;
+ } ir_u;
__be64 ir_free; /* free inode mask */
} xfs_inobt_rec_t;
typedef struct xfs_inobt_rec_incore {
xfs_agino_t ir_startino; /* starting inode number */
- __int32_t ir_freecount; /* count of free inodes (set bits) */
+ __uint16_t ir_holemask; /* hole mask for sparse chunks */
+ __uint8_t ir_count; /* total inode count */
+ __uint8_t ir_freecount; /* count of free inodes (set bits) */
xfs_inofree_t ir_free; /* free inode mask */
} xfs_inobt_rec_incore_t;
+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+ /* non-zero holemask represents a sparse rec. */
+ return holemask;
+}
/*
* Key structure
@@ -1444,17 +1491,21 @@ struct xfs_acl {
*/
#define XFS_ACL_MAX_ENTRIES(mp) \
(xfs_sb_version_hascrc(&mp->m_sb) \
- ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
sizeof(struct xfs_acl_entry) \
: 25)
-#define XFS_ACL_MAX_SIZE(mp) \
+#define XFS_ACL_SIZE(cnt) \
(sizeof(struct xfs_acl) + \
- sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+ sizeof(struct xfs_acl_entry) * cnt)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp)))
+
/* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE "SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
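Reviewer note: as a rough illustration of the new holemask constants introduced above (assuming XFS_INODES_PER_CHUNK is 64 and the 16-bit holemask shown here), each holemask bit covers 64 / 16 = 4 inodes, and any non-zero holemask marks the record as sparse:

        #include <stdio.h>
        #include <stdint.h>

        #define NBBY                        8
        #define XFS_INODES_PER_CHUNK        64
        #define XFS_INOBT_HOLEMASK_BITS     (NBBY * (int)sizeof(uint16_t))
        #define XFS_INODES_PER_HOLEMASK_BIT (XFS_INODES_PER_CHUNK / XFS_INOBT_HOLEMASK_BITS)

        int main(void)
        {
                uint16_t holemask = 0xff00;     /* upper half of the chunk is a hole */

                printf("inodes per holemask bit: %d\n", XFS_INODES_PER_HOLEMASK_BIT);
                printf("record is sparse: %s\n", holemask ? "yes" : "no");
                printf("physically allocated inodes: %d\n",
                       (XFS_INOBT_HOLEMASK_BITS - __builtin_popcount(holemask)) *
                       XFS_INODES_PER_HOLEMASK_BIT);
                return 0;
        }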
diff --git a/kernel/fs/xfs/libxfs/xfs_fs.h b/kernel/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca..b2b73a998 100644
--- a/kernel/fs/xfs/libxfs/xfs_fs.h
+++ b/kernel/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
/*
* Minimum and maximum sizes need for growth checks.
@@ -489,6 +490,16 @@ typedef struct xfs_swapext
#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
/*
+ * ioctl limits
+ */
+#ifdef XATTR_LIST_MAX
+# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX
+#else
+# define XFS_XATTR_LIST_MAX 65536
+#endif
+
+
+/*
* ioctl commands that are used by Linux filesystems
*/
#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc.c b/kernel/fs/xfs/libxfs/xfs_ialloc.c
index 1c9e75521..70c1db99f 100644
--- a/kernel/fs/xfs/libxfs/xfs_ialloc.c
+++ b/kernel/fs/xfs/libxfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
/*
@@ -65,6 +66,8 @@ xfs_inobt_lookup(
int *stat) /* success/failure */
{
cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_holemask = 0;
+ cur->bc_rec.i.ir_count = 0;
cur->bc_rec.i.ir_freecount = 0;
cur->bc_rec.i.ir_free = 0;
return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +85,14 @@ xfs_inobt_update(
union xfs_btree_rec rec;
rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
- rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+ rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+ rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ }
rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
return xfs_btree_update(cur, &rec);
}
@@ -100,12 +110,27 @@ xfs_inobt_get_rec(
int error;
error = xfs_btree_get_rec(cur, &rec, stat);
- if (!error && *stat == 1) {
- irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
- irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
- irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+ if (error || *stat == 0)
+ return error;
+
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+ irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+ irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+ } else {
+ /*
+ * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+ * values for full inode chunks.
+ */
+ irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+ irec->ir_count = XFS_INODES_PER_CHUNK;
+ irec->ir_freecount =
+ be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
}
- return error;
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+
+ return 0;
}
/*
@@ -114,10 +139,14 @@ xfs_inobt_get_rec(
STATIC int
xfs_inobt_insert_rec(
struct xfs_btree_cur *cur,
+ __uint16_t holemask,
+ __uint8_t count,
__int32_t freecount,
xfs_inofree_t free,
int *stat)
{
+ cur->bc_rec.i.ir_holemask = holemask;
+ cur->bc_rec.i.ir_count = count;
cur->bc_rec.i.ir_freecount = freecount;
cur->bc_rec.i.ir_free = free;
return xfs_btree_insert(cur, stat);
@@ -154,7 +183,9 @@ xfs_inobt_insert(
}
ASSERT(i == 0);
- error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+ XFS_INODES_PER_CHUNK,
+ XFS_INODES_PER_CHUNK,
XFS_INOBT_ALL_FREE, &i);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +251,7 @@ xfs_ialloc_inode_init(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct list_head *buffer_list,
+ int icount,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_agblock_t length,
@@ -275,7 +307,7 @@ xfs_ialloc_inode_init(
* they track in the AIL as if they were physically logged.
*/
if (tp)
- xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+ xfs_icreate_log(tp, agno, agbno, icount,
mp->m_sb.sb_inodesize, length, gen);
} else
version = 2;
@@ -307,7 +339,8 @@ xfs_ialloc_inode_init(
if (version == 3) {
free->di_ino = cpu_to_be64(ino);
ino++;
- uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&free->di_uuid,
+ &mp->m_sb.sb_meta_uuid);
xfs_dinode_calc_crc(mp, free);
} else if (tp) {
/* just log the inode core */
@@ -347,6 +380,214 @@ xfs_ialloc_inode_init(
}
/*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+ struct xfs_mount *mp,
+ xfs_agino_t *startino,
+ uint16_t *allocmask)
+{
+ xfs_agblock_t agbno;
+ xfs_agblock_t mod;
+ int offset;
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+ mod = agbno % mp->m_sb.sb_inoalignmt;
+ if (!mod)
+ return;
+
+ /* calculate the inode offset and align startino */
+ offset = mod << mp->m_sb.sb_inopblog;
+ *startino -= offset;
+
+ /*
+ * Since startino has been aligned down, left shift allocmask such that
+ * it continues to represent the same physical inodes relative to the
+ * new startino.
+ */
+ *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+ struct xfs_inobt_rec_incore *trec, /* tgt record */
+ struct xfs_inobt_rec_incore *srec) /* src record */
+{
+ uint64_t talloc;
+ uint64_t salloc;
+
+ /* records must cover the same inode range */
+ if (trec->ir_startino != srec->ir_startino)
+ return false;
+
+ /* both records must be sparse */
+ if (!xfs_inobt_issparse(trec->ir_holemask) ||
+ !xfs_inobt_issparse(srec->ir_holemask))
+ return false;
+
+ /* both records must track some inodes */
+ if (!trec->ir_count || !srec->ir_count)
+ return false;
+
+ /* can't exceed capacity of a full record */
+ if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+ return false;
+
+ /* verify there is no allocation overlap */
+ talloc = xfs_inobt_irec_to_allocmask(trec);
+ salloc = xfs_inobt_irec_to_allocmask(srec);
+ if (talloc & salloc)
+ return false;
+
+ return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+ struct xfs_inobt_rec_incore *trec, /* target */
+ struct xfs_inobt_rec_incore *srec) /* src */
+{
+ ASSERT(trec->ir_startino == srec->ir_startino);
+
+ /* combine the counts */
+ trec->ir_count += srec->ir_count;
+ trec->ir_freecount += srec->ir_freecount;
+
+ /*
+ * Merge the holemask and free mask. For both fields, 0 bits refer to
+ * allocated inodes. We combine the allocated ranges with bitwise AND.
+ */
+ trec->ir_holemask &= srec->ir_holemask;
+ trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ int btnum,
+ struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
+ bool merge) /* merge or replace */
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ int error;
+ int i;
+ struct xfs_inobt_rec_incore rec;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, insert a new record and return */
+ if (i == 0) {
+ error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+ nrec->ir_count, nrec->ir_freecount,
+ nrec->ir_free, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+ goto out;
+ }
+
+ /*
+ * A record exists at this startino. Merge or replace the record
+ * depending on what we've been asked to do.
+ */
+ if (merge) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ rec.ir_startino == nrec->ir_startino,
+ error);
+
+ /*
+ * This should never fail. If we have coexisting records that
+ * cannot merge, something is seriously wrong.
+ */
+ XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+ error);
+
+ trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+ rec.ir_holemask, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ /* merge to nrec to output the updated record */
+ __xfs_inobt_rec_merge(nrec, &rec);
+
+ trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ error = xfs_inobt_rec_check_count(mp, nrec);
+ if (error)
+ goto error;
+ }
+
+ error = xfs_inobt_update(cur, nrec);
+ if (error)
+ goto error;
+
+out:
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
@@ -364,11 +605,22 @@ xfs_ialloc_ag_alloc(
xfs_agino_t newlen; /* new number of inodes */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
+ uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
+ struct xfs_inobt_rec_incore rec;
struct xfs_perag *pag;
+ int do_sparse = 0;
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = tp->t_mountp;
+ args.fsbno = NULLFSBLOCK;
+
+#ifdef DEBUG
+ /* randomly do sparse inode allocations */
+ if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
+ args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+ do_sparse = prandom_u32() & 1;
+#endif
/*
* Locking will ensure that we don't have two callers in here
@@ -390,6 +642,8 @@ xfs_ialloc_ag_alloc(
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
args.mp->m_ialloc_blks;
+ if (do_sparse)
+ goto sparse_alloc;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -428,8 +682,7 @@ xfs_ialloc_ag_alloc(
* subsequent requests.
*/
args.minalignslop = 0;
- } else
- args.fsbno = NULLFSBLOCK;
+ }
if (unlikely(args.fsbno == NULLFSBLOCK)) {
/*
@@ -480,6 +733,47 @@ xfs_ialloc_ag_alloc(
return error;
}
+ /*
+ * Finally, try a sparse allocation if the filesystem supports it and
+ * the sparse allocation length is smaller than a full chunk.
+ */
+ if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+ args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+ args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.alignment = args.mp->m_sb.sb_spino_align;
+ args.prod = 1;
+
+ args.minlen = args.mp->m_ialloc_min_blks;
+ args.maxlen = args.minlen;
+
+ /*
+ * The inode record will be aligned to full chunk size. We must
+ * prevent sparse allocation from AG boundaries that result in
+ * invalid inode records, such as records that start at agbno 0
+ * or extend beyond the AG.
+ *
+ * Set min agbno to the first aligned, non-zero agbno and max to
+ * the last aligned agbno that is at least one full chunk from
+ * the end of the AG.
+ */
+ args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+ args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+ args.mp->m_sb.sb_inoalignmt) -
+ args.mp->m_ialloc_blks;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ newlen = args.len << args.mp->m_sb.sb_inopblog;
+ ASSERT(newlen <= XFS_INODES_PER_CHUNK);
+ allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+ }
+
if (args.fsbno == NULLFSBLOCK) {
*alloc = 0;
return 0;
@@ -495,8 +789,8 @@ xfs_ialloc_ag_alloc(
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
- args.len, prandom_u32());
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
+ args.agbno, args.len, prandom_u32());
if (error)
return error;
@@ -504,6 +798,73 @@ xfs_ialloc_ag_alloc(
* Convert the results.
*/
newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+ if (xfs_inobt_issparse(~allocmask)) {
+ /*
+ * We've allocated a sparse chunk. Align the startino and mask.
+ */
+ xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+ rec.ir_startino = newino;
+ rec.ir_holemask = ~allocmask;
+ rec.ir_count = newlen;
+ rec.ir_freecount = newlen;
+ rec.ir_free = XFS_INOBT_ALL_FREE;
+
+ /*
+ * Insert the sparse record into the inobt and allow for a merge
+ * if necessary. If a merge does occur, rec is updated to the
+ * merged record.
+ */
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+ &rec, true);
+ if (error == -EFSCORRUPTED) {
+ xfs_alert(args.mp,
+ "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+ XFS_AGINO_TO_INO(args.mp, agno,
+ rec.ir_startino),
+ rec.ir_holemask, rec.ir_count);
+ xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ if (error)
+ return error;
+
+ /*
+ * We can't merge the part we've just allocated as for the inobt
+ * due to finobt semantics. The original record may or may not
+ * exist independent of whether physical inodes exist in this
+ * sparse chunk.
+ *
+ * We must update the finobt record based on the inobt record.
+ * rec contains the fully merged and up to date inobt record
+ * from the previous call. Set merge false to replace any
+ * existing record with this one.
+ */
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+ XFS_BTNUM_FINO, &rec,
+ false);
+ if (error)
+ return error;
+ }
+ } else {
+ /* full chunk - insert new records to both btrees */
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+ newlen, XFS_BTNUM_FINO);
+ if (error)
+ return error;
+ }
+ }
+
+ /*
+ * Update AGI counts and newino.
+ */
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
pag = xfs_perag_get(args.mp, agno);
@@ -512,20 +873,6 @@ xfs_ialloc_ag_alloc(
agi->agi_newino = cpu_to_be32(newino);
/*
- * Insert records describing the new inode chunk into the btrees.
- */
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
- XFS_BTNUM_INO);
- if (error)
- return error;
-
- if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
- XFS_BTNUM_FINO);
- if (error)
- return error;
- }
- /*
* Log allocation group header fields
*/
xfs_ialloc_log_agi(tp, agbp,
@@ -645,7 +992,7 @@ xfs_ialloc_ag_select(
* if we fail allocation due to alignment issues then it is most
* likely a real ENOSPC condition.
*/
- ineed = mp->m_ialloc_blks;
+ ineed = mp->m_ialloc_min_blks;
if (flags && ineed > 1)
ineed += xfs_ialloc_cluster_alignment(mp);
longest = pag->pagf_longest;
@@ -732,6 +1079,27 @@ xfs_ialloc_get_rec(
}
/*
+ * Return the offset of the first free inode in the record. If the inode chunk
+ * is sparsely allocated, we convert the record holemask to inode granularity
+ * and mask off the unallocated regions from the inode free mask.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+ struct xfs_inobt_rec_incore *rec)
+{
+ xfs_inofree_t realfree;
+
+ /* if there are no holes, return the first available offset */
+ if (!xfs_inobt_issparse(rec->ir_holemask))
+ return xfs_lowbit64(rec->ir_free);
+
+ realfree = xfs_inobt_irec_to_allocmask(rec);
+ realfree &= rec->ir_free;
+
+ return xfs_lowbit64(realfree);
+}
+
+/*
* Allocate an inode using the inobt-only algorithm.
*/
STATIC int
@@ -961,7 +1329,7 @@ newino:
}
alloc_inode:
- offset = xfs_lowbit64(rec.ir_free);
+ offset = xfs_inobt_first_free_inode(&rec);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1210,7 +1578,7 @@ xfs_dialloc_ag(
if (error)
goto error_cur;
- offset = xfs_lowbit64(rec.ir_free);
+ offset = xfs_inobt_first_free_inode(&rec);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1439,6 +1807,83 @@ out_error:
return error;
}
+/*
+ * Free the blocks of an inode chunk. We must consider that the inode chunk
+ * might be sparse and only free the regions that are allocated as part of the
+ * chunk.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *rec,
+ struct xfs_bmap_free *flist)
+{
+ xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+ int startidx, endidx;
+ int nextbit;
+ xfs_agblock_t agbno;
+ int contigblk;
+ DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+ if (!xfs_inobt_issparse(rec->ir_holemask)) {
+ /* not sparse, calculate extent info directly */
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+ XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
+ mp->m_ialloc_blks, flist, mp);
+ return;
+ }
+
+ /* holemask is only 16-bits (fits in an unsigned long) */
+ ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+ holemask[0] = rec->ir_holemask;
+
+ /*
+ * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+ * holemask and convert the start/end index of each range to an extent.
+ * We start with the start and end index both pointing at the first 0 in
+ * the mask.
+ */
+ startidx = endidx = find_first_zero_bit(holemask,
+ XFS_INOBT_HOLEMASK_BITS);
+ nextbit = startidx + 1;
+ while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+ nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+ nextbit);
+ /*
+ * If the next zero bit is contiguous, update the end index of
+ * the current range and continue.
+ */
+ if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+ nextbit == endidx + 1) {
+ endidx = nextbit;
+ goto next;
+ }
+
+ /*
+ * nextbit is not contiguous with the current end index. Convert
+ * the current start/end to an extent and add it to the free
+ * list.
+ */
+ agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+ contigblk = ((endidx - startidx + 1) *
+ XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+
+ ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+ ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+ flist, mp);
+
+ /* reset range to current bit and carry on... */
+ startidx = endidx = nextbit;
+
+next:
+ nextbit++;
+ }
+}
+
STATIC int
xfs_difree_inobt(
struct xfs_mount *mp,
@@ -1446,8 +1891,7 @@ xfs_difree_inobt(
struct xfs_buf *agbp,
xfs_agino_t agino,
struct xfs_bmap_free *flist,
- int *deleted,
- xfs_ino_t *first_ino,
+ struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
@@ -1501,20 +1945,23 @@ xfs_difree_inobt(
rec.ir_freecount++;
/*
- * When an inode cluster is free, it becomes eligible for removal
+ * When an inode chunk is free, it becomes eligible for removal. Don't
+ * remove the chunk if the block size is large enough for multiple inode
+ * chunks (that might not be free).
*/
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
- (rec.ir_freecount == mp->m_ialloc_inos)) {
-
- *deleted = 1;
- *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+ rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ xic->deleted = 1;
+ xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+ xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
/*
* Remove the inode cluster from the AGI B+Tree, adjust the
* AGI and Superblock inode counts, and mark the disk space
* to be freed when the transaction is committed.
*/
- ilen = mp->m_ialloc_inos;
+ ilen = rec.ir_freecount;
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1530,11 +1977,9 @@ xfs_difree_inobt(
goto error0;
}
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
- XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
- mp->m_ialloc_blks, flist, mp);
+ xfs_difree_inode_chunk(mp, agno, &rec, flist);
} else {
- *deleted = 0;
+ xic->deleted = 0;
error = xfs_inobt_update(cur, &rec);
if (error) {
@@ -1599,7 +2044,9 @@ xfs_difree_finobt(
*/
XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
- error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+ ibtrec->ir_count,
+ ibtrec->ir_freecount,
ibtrec->ir_free, &i);
if (error)
goto error;
@@ -1634,8 +2081,13 @@ xfs_difree_finobt(
* free inode. Hence, if all of the inodes are free and we aren't
* keeping inode chunks permanently on disk, remove the record.
* Otherwise, update the record with the new information.
+ *
+ * Note that we currently can't free chunks when the block size is large
+ * enough for multiple chunks. Leave the finobt record to remain in sync
+ * with the inobt.
*/
- if (rec.ir_freecount == mp->m_ialloc_inos &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
!(mp->m_flags & XFS_MOUNT_IKEEP)) {
error = xfs_btree_delete(cur, &i);
if (error)
@@ -1671,8 +2123,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *deleted,/* set if inode cluster was deleted */
- xfs_ino_t *first_ino)/* first inode in deleted cluster */
+ struct xfs_icluster *xic) /* cluster info if deleted */
{
/* REFERENCED */
xfs_agblock_t agbno; /* block number containing inode */
@@ -1723,8 +2174,7 @@ xfs_difree(
/*
* Fix up the inode allocation btree.
*/
- error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
- &rec);
+ error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
if (error)
goto error0;
@@ -1784,7 +2234,7 @@ xfs_imap_lookup(
}
xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
if (error)
return error;
@@ -2051,9 +2501,14 @@ xfs_agi_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
+ return false;
+ }
+
/*
* Validate the magic number of the agi block.
*/
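Reviewer note: the merge rule added in __xfs_inobt_rec_merge() above is simple once the bit conventions are kept in mind: the counts add, while the holemask and free mask combine with bitwise AND because 0 bits mean "allocated" in both fields. A standalone sketch of that rule (simplified userspace types, not the kernel structures):

        #include <stdint.h>
        #include <stdio.h>

        struct irec {
                uint16_t holemask;      /* 1 bits are holes (no physical inodes) */
                uint8_t  count;         /* physically allocated inodes */
                uint8_t  freecount;     /* free inodes */
                uint64_t free;          /* 1 bits are free inodes */
        };

        static void merge(struct irec *tgt, const struct irec *src)
        {
                tgt->count     += src->count;
                tgt->freecount += src->freecount;
                tgt->holemask  &= src->holemask;        /* AND combines the allocated ranges */
                tgt->free      &= src->free;            /* likewise for the free mask */
        }

        int main(void)
        {
                /* two non-overlapping sparse halves of one 64-inode chunk, all free */
                struct irec lo = { 0xff00, 32, 32, ~0ULL };     /* lower half allocated */
                struct irec hi = { 0x00ff, 32, 32, ~0ULL };     /* upper half allocated */

                merge(&lo, &hi);
                printf("holemask 0x%04x count %u free 0x%016llx\n",
                       lo.holemask, (unsigned)lo.count,
                       (unsigned long long)lo.free);
                /* -> holemask 0x0000 count 64 free 0xffffffffffffffff: a full, free chunk */
                return 0;
        }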
diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc.h b/kernel/fs/xfs/libxfs/xfs_ialloc.h
index 100007d56..6e450df29 100644
--- a/kernel/fs/xfs/libxfs/xfs_ialloc.h
+++ b/kernel/fs/xfs/libxfs/xfs_ialloc.h
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
/* Move inodes in clusters of this size */
#define XFS_INODE_BIG_CLUSTER_SIZE 8192
+struct xfs_icluster {
+ bool deleted; /* record is deleted */
+ xfs_ino_t first_ino; /* first inode number */
+ uint64_t alloc; /* inode phys. allocation bitmap for
+ * sparse chunks */
+};
+
/* Calculate and return the number of filesystem blocks per inode cluster */
static inline int
xfs_icluster_size_fsb(
@@ -44,8 +51,7 @@ xfs_icluster_size_fsb(
static inline struct xfs_dinode *
xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
{
- return (struct xfs_dinode *)
- (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
+ return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
}
/*
@@ -90,8 +96,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *deleted, /* set if inode cluster was deleted */
- xfs_ino_t *first_ino); /* first inode in deleted cluster */
+ struct xfs_icluster *ifree); /* cluster info if deleted */
/*
* Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
* Inode chunk initialisation routine
*/
int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
- struct list_head *buffer_list,
+ struct list_head *buffer_list, int icount,
xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_agblock_t length, unsigned int gen);
diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c
index 964c465ca..f39b285be 100644
--- a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
union xfs_btree_rec *rec)
{
rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
- rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ rec->inobt.ir_u.sp.ir_holemask =
+ cpu_to_be16(cur->bc_rec.i.ir_holemask);
+ rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+ rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec->inobt.ir_u.f.ir_freecount =
+ cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ }
rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
}
@@ -230,7 +239,7 @@ xfs_inobt_verify(
case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
return false;
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
return blocklen / sizeof(xfs_inobt_rec_t);
return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
}
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
+ */
+uint64_t
+xfs_inobt_irec_to_allocmask(
+ struct xfs_inobt_rec_incore *rec)
+{
+ uint64_t bitmap = 0;
+ uint64_t inodespbit;
+ int nextbit;
+ uint allocbitmap;
+
+ /*
+ * The holemask has 16-bits for a 64 inode record. Therefore each
+ * holemask bit represents multiple inodes. Create a mask of bits to set
+ * in the allocmask for each holemask bit.
+ */
+ inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+ /*
+ * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+ * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+ * anything beyond the 16 holemask bits since this casts to a larger
+ * type.
+ */
+ allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+ /*
+ * allocbitmap is the inverted holemask so every set bit represents
+ * allocated inodes. To expand from 16-bit holemask granularity to
+ * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
+ * bitmap for every holemask bit.
+ */
+ nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+ while (nextbit != -1) {
+ ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+ bitmap |= (inodespbit <<
+ (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+ nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+ }
+
+ return bitmap;
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+ struct xfs_mount *mp,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int inocount = 0;
+ int nextbit = 0;
+ uint64_t allocbmap;
+ int wordsz;
+
+ wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+ allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+ while (nextbit != -1) {
+ inocount++;
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+ nextbit + 1);
+ }
+
+ if (inocount != rec->ir_count)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+#endif /* DEBUG */
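Reviewer note: xfs_inobt_irec_to_allocmask() above expands the 16-bit holemask into a 64-bit per-inode allocation bitmap by inverting it and fanning each set bit out over XFS_INODES_PER_HOLEMASK_BIT positions. A plain-C sketch of the same expansion (assuming 4 inodes per holemask bit, i.e. a 64-inode chunk):

        #include <stdint.h>
        #include <stdio.h>

        #define INODES_PER_HOLEMASK_BIT 4       /* 64 inodes / 16 holemask bits */

        static uint64_t holemask_to_allocmask(uint16_t holemask)
        {
                uint64_t bitmap = 0;
                uint64_t run = (1ULL << INODES_PER_HOLEMASK_BIT) - 1;   /* 0xf */
                int i;

                /* 0 bits in the holemask are physically allocated regions */
                for (i = 0; i < 16; i++)
                        if (!(holemask & (1u << i)))
                                bitmap |= run << (i * INODES_PER_HOLEMASK_BIT);
                return bitmap;
        }

        int main(void)
        {
                /* upper half of the chunk is a hole */
                printf("0x%016llx\n",
                       (unsigned long long)holemask_to_allocmask(0xff00));
                /* prints 0x00000000ffffffff */
                return 0;
        }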
diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c..bd8845321 100644
--- a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
xfs_btnum_t);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+ struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec) 0
+#endif /* DEBUG */
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_inode_buf.c b/kernel/fs/xfs/libxfs/xfs_inode_buf.c
index 002b6b3a1..65485cfc4 100644
--- a/kernel/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/kernel/fs/xfs/libxfs/xfs_inode_buf.c
@@ -46,8 +46,7 @@ xfs_inobp_check(
j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
for (i = 0; i < j; i++) {
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
- i * mp->m_sb.sb_inodesize);
+ dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
if (!dip->di_next_unlinked) {
xfs_alert(mp,
"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
@@ -63,11 +62,12 @@ xfs_inobp_check(
* has not had the inode cores stamped into it. Hence for readahead, the buffer
* may be potentially invalid.
*
- * If the readahead buffer is invalid, we don't want to mark it with an error,
- * but we do want to clear the DONE status of the buffer so that a followup read
- * will re-read it from disk. This will ensure that we don't get an unnecessary
- * warnings during log recovery and we don't get unnecssary panics on debug
- * kernels.
+ * If the readahead buffer is invalid, we need to mark it with an error and
+ * clear the DONE status of the buffer so that a followup read will re-read it
+ * from disk. We don't report the error otherwise to avoid warnings during log
+ * recovery and we don't get unnecessary panics on debug kernels. We use EIO here
+ * because all we want to do is say readahead failed; there is no-one to report
+ * the error to, so this will distinguish it from a non-ra verifier failure.
*/
static void
xfs_inode_buf_verify(
@@ -86,8 +86,7 @@ xfs_inode_buf_verify(
int di_ok;
xfs_dinode_t *dip;
- dip = (struct xfs_dinode *)xfs_buf_offset(bp,
- (i << mp->m_sb.sb_inodelog));
+ dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
XFS_DINODE_GOOD_VERSION(dip->di_version);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
@@ -95,6 +94,7 @@ xfs_inode_buf_verify(
XFS_RANDOM_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
+ xfs_buf_ioerror(bp, -EIO);
return;
}
@@ -186,7 +186,7 @@ xfs_imap_to_bp(
}
*bpp = bp;
- *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+ *dipp = xfs_buf_offset(bp, imap->im_boffset);
return 0;
}
@@ -306,7 +306,7 @@ xfs_dinode_verify(
return false;
if (be64_to_cpu(dip->di_ino) != ip->i_ino)
return false;
- if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
return false;
return true;
}
@@ -368,7 +368,7 @@ xfs_iread(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
ip->i_d.di_version = 3;
ip->i_d.di_ino = ip->i_ino;
- uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
} else
ip->i_d.di_version = 2;
return 0;
diff --git a/kernel/fs/xfs/libxfs/xfs_sb.c b/kernel/fs/xfs/libxfs/xfs_sb.c
index dc4bfc5d8..a0b071d88 100644
--- a/kernel/fs/xfs/libxfs/xfs_sb.c
+++ b/kernel/fs/xfs/libxfs/xfs_sb.c
@@ -35,6 +35,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_log.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -131,10 +132,11 @@ xfs_mount_validate_sb(
if (xfs_sb_has_compat_feature(sbp,
XFS_SB_FEAT_COMPAT_UNKNOWN)) {
xfs_warn(mp,
-"Superblock has unknown compatible features (0x%x) enabled.\n"
-"Using a more recent kernel is recommended.",
+"Superblock has unknown compatible features (0x%x) enabled.",
(sbp->sb_features_compat &
XFS_SB_FEAT_COMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Using a more recent kernel is recommended.");
}
if (xfs_sb_has_ro_compat_feature(sbp,
@@ -145,20 +147,32 @@ xfs_mount_validate_sb(
XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
xfs_warn(mp,
-"Attempted to mount read-only compatible filesystem read-write.\n"
+"Attempted to mount read-only compatible filesystem read-write.");
+ xfs_warn(mp,
"Filesystem can only be safely mounted read only.");
+
return -EINVAL;
}
}
if (xfs_sb_has_incompat_feature(sbp,
XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
xfs_warn(mp,
-"Superblock has unknown incompatible features (0x%x) enabled.\n"
-"Filesystem can not be safely mounted by this kernel.",
+"Superblock has unknown incompatible features (0x%x) enabled.",
(sbp->sb_features_incompat &
XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Filesystem can not be safely mounted by this kernel.");
return -EINVAL;
}
+ } else if (xfs_sb_version_hascrc(sbp)) {
+ /*
+ * We can't read verify the sb LSN because the read verifier is
+ * called before the log is allocated and processed. We know the
+ * log is set up before write verifier (!check_version) calls,
+ * so just check it here.
+ */
+ if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+ return -EFSCORRUPTED;
}
if (xfs_sb_version_has_pquotino(sbp)) {
@@ -174,6 +188,24 @@ xfs_mount_validate_sb(
return -EFSCORRUPTED;
}
+ /*
+ * Full inode chunks must be aligned to inode chunk size when
+ * sparse inodes are enabled to support the sparse chunk
+ * allocation algorithm and prevent overlapping inode records.
+ */
+ if (xfs_sb_version_hassparseinodes(sbp)) {
+ uint32_t align;
+
+ align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+ >> sbp->sb_blocklog;
+ if (sbp->sb_inoalignmt != align) {
+ xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+ sbp->sb_inoalignmt, align);
+ return -EINVAL;
+ }
+ }
+
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
xfs_warn(mp,
@@ -374,9 +406,17 @@ __xfs_sb_from_disk(
be32_to_cpu(from->sb_features_log_incompat);
/* crc is only used on disk, not in memory; just init to 0 here. */
to->sb_crc = 0;
- to->sb_pad = 0;
+ to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
to->sb_lsn = be64_to_cpu(from->sb_lsn);
+ /*
+ * sb_meta_uuid is only on disk if it differs from sb_uuid and the
+ * feature flag is set; if not set we keep it only in memory.
+ */
+ if (xfs_sb_version_hasmetauuid(to))
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+ else
+ uuid_copy(&to->sb_meta_uuid, &from->sb_uuid);
/* Convert on-disk flags to in-memory flags? */
if (convert_xquota)
xfs_sb_quota_from_disk(to);
@@ -516,8 +556,10 @@ xfs_sb_to_disk(
cpu_to_be32(from->sb_features_incompat);
to->sb_features_log_incompat =
cpu_to_be32(from->sb_features_log_incompat);
- to->sb_pad = 0;
+ to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
to->sb_lsn = cpu_to_be64(from->sb_lsn);
+ if (xfs_sb_version_hasmetauuid(from))
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
}
}
@@ -689,6 +731,11 @@ xfs_sb_mount_common(
mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
sbp->sb_inopblock);
mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+
+ if (sbp->sb_spino_align)
+ mp->m_ialloc_min_blks = sbp->sb_spino_align;
+ else
+ mp->m_ialloc_min_blks = mp->m_ialloc_blks;
}
/*
@@ -792,12 +839,12 @@ xfs_sync_sb(
tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_log_sb(tp);
if (wait)
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
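Reviewer note: the sb_inoalignmt check added to xfs_mount_validate_sb() above requires the inode alignment to equal the size of a full inode chunk in filesystem blocks. As a quick worked example (assuming 512-byte inodes, 4096-byte blocks and the usual 64-inode chunk), the required alignment is 64 * 512 >> 12 = 8 blocks:

        #include <stdio.h>

        int main(void)
        {
                unsigned int inodes_per_chunk = 64;     /* XFS_INODES_PER_CHUNK */
                unsigned int inodesize = 512;           /* sb_inodesize (assumed) */
                unsigned int blocklog = 12;             /* sb_blocklog for 4096-byte blocks */

                /* same expression as the validation code: chunk bytes >> blocklog */
                unsigned int align = (inodes_per_chunk * inodesize) >> blocklog;

                printf("required sb_inoalignmt: %u filesystem blocks\n", align);   /* 8 */
                return 0;
        }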
diff --git a/kernel/fs/xfs/libxfs/xfs_shared.h b/kernel/fs/xfs/libxfs/xfs_shared.h
index 8dda4b321..5be529707 100644
--- a/kernel/fs/xfs/libxfs/xfs_shared.h
+++ b/kernel/fs/xfs/libxfs/xfs_shared.h
@@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
count in superblock */
/*
- * Values for call flags parameter.
- */
-#define XFS_TRANS_RELEASE_LOG_RES 0x4
-#define XFS_TRANS_ABORT 0x8
-
-/*
* Field values for xfs_trans_mod_sb.
*/
#define XFS_TRANS_SB_ICOUNT 0x00000001
diff --git a/kernel/fs/xfs/libxfs/xfs_symlink_remote.c b/kernel/fs/xfs/libxfs/xfs_symlink_remote.c
index e7e26bd64..cb6fd20a4 100644
--- a/kernel/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/kernel/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -31,6 +31,7 @@
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
@@ -60,10 +61,11 @@ xfs_symlink_hdr_set(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return 0;
+ memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
dsl->sl_offset = cpu_to_be32(offset);
dsl->sl_bytes = cpu_to_be32(size);
- uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid);
dsl->sl_owner = cpu_to_be64(ino);
dsl->sl_blkno = cpu_to_be64(bp->b_bn);
bp->b_ops = &xfs_symlink_buf_ops;
@@ -107,7 +109,7 @@ xfs_symlink_verify(
return false;
if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
return false;
- if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
return false;
@@ -116,6 +118,8 @@ xfs_symlink_verify(
return false;
if (dsl->sl_owner == 0)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+ return false;
return true;
}
@@ -183,6 +187,7 @@ xfs_symlink_local_to_remote(
if (!xfs_sb_version_hascrc(&mp->m_sb)) {
bp->b_ops = NULL;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
return;
}
@@ -198,4 +203,6 @@ xfs_symlink_local_to_remote(
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
+ ifp->if_bytes - 1);
}
diff --git a/kernel/fs/xfs/libxfs/xfs_trans_resv.h b/kernel/fs/xfs/libxfs/xfs_trans_resv.h
index 2d5bdfce6..797815012 100644
--- a/kernel/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/kernel/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,9 +73,9 @@ struct xfs_trans_resv {
* 2 trees * (2 blocks/level * max depth - 1) * block size
*/
#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+ ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
- ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+ ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
/*
* Per-directory log reservation for any directory change.
diff --git a/kernel/fs/xfs/libxfs/xfs_trans_space.h b/kernel/fs/xfs/libxfs/xfs_trans_space.h
index bf9c45793..41e0428d8 100644
--- a/kernel/fs/xfs/libxfs/xfs_trans_space.h
+++ b/kernel/fs/xfs/libxfs/xfs_trans_space.h
@@ -67,7 +67,7 @@
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
- (2 * XFS_AG_MAXLEVELS(mp))
+ (2 * (mp)->m_ag_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
#define XFS_LINK_SPACE_RES(mp,nl) \
diff --git a/kernel/fs/xfs/xfs_acl.c b/kernel/fs/xfs/xfs_acl.c
index 4b641676f..6bb470fbb 100644
--- a/kernel/fs/xfs/xfs_acl.c
+++ b/kernel/fs/xfs/xfs_acl.c
@@ -37,16 +37,19 @@
STATIC struct posix_acl *
xfs_acl_from_disk(
- struct xfs_acl *aclp,
- int max_entries)
+ const struct xfs_acl *aclp,
+ int len,
+ int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
- struct xfs_acl_entry *ace;
+ const struct xfs_acl_entry *ace;
unsigned int count, i;
+ if (len < sizeof(*aclp))
+ return ERR_PTR(-EFSCORRUPTED);
count = be32_to_cpu(aclp->acl_cnt);
- if (count > max_entries)
+ if (count > max_entries || XFS_ACL_SIZE(count) != len)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -160,10 +163,11 @@ xfs_get_acl(struct inode *inode, int type)
*/
if (error == -ENOATTR)
goto out_update_cache;
+ acl = ERR_PTR(error);
goto out;
}
- acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+ acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
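Reviewer note: the hunk above tightens xfs_acl_from_disk() to reject attribute buffers whose length does not exactly match XFS_ACL_SIZE(acl_cnt). A simplified sketch of that length check (stand-in structures, not the real on-disk xfs_acl layout):

        #include <stdint.h>
        #include <stdio.h>

        /* simplified stand-ins for the on-disk ACL layout */
        struct acl_entry { uint32_t ae_tag, ae_id, ae_perm; };
        struct acl_hdr   { uint32_t acl_cnt; };

        #define ACL_SIZE(cnt) (sizeof(struct acl_hdr) + sizeof(struct acl_entry) * (cnt))

        /* mirrors the new check: the attribute length must match the entry count */
        static int acl_len_ok(uint32_t cnt, size_t len, uint32_t max_entries)
        {
                if (len < sizeof(struct acl_hdr))
                        return 0;
                return cnt <= max_entries && ACL_SIZE(cnt) == len;
        }

        int main(void)
        {
                printf("%d\n", acl_len_ok(3, ACL_SIZE(3), 25));         /* 1: consistent */
                printf("%d\n", acl_len_ok(3, ACL_SIZE(2), 25));         /* 0: truncated buffer */
                return 0;
        }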
diff --git a/kernel/fs/xfs/xfs_acl.h b/kernel/fs/xfs/xfs_acl.h
index 3841b07f2..52f8255d6 100644
--- a/kernel/fs/xfs/xfs_acl.h
+++ b/kernel/fs/xfs/xfs_acl.h
@@ -20,7 +20,6 @@
struct inode;
struct posix_acl;
-struct xfs_inode;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
@@ -36,4 +35,7 @@ static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
# define posix_acl_access_exists(inode) 0
# define posix_acl_default_exists(inode) 0
#endif /* CONFIG_XFS_POSIX_ACL */
+
+extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
+
#endif /* __XFS_ACL_H__ */
diff --git a/kernel/fs/xfs/xfs_aops.c b/kernel/fs/xfs/xfs_aops.c
index a56960dd1..29e7e5dd5 100644
--- a/kernel/fs/xfs/xfs_aops.c
+++ b/kernel/fs/xfs/xfs_aops.c
@@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc(
* We may pass freeze protection with a transaction. So tell lockdep
* we released it.
*/
- rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 1, _THIS_IP_);
+ __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
/*
* We hand off the transaction to the completion thread now, so
* clear the flag here.
@@ -145,7 +144,7 @@ xfs_setfilesize(
isize = xfs_new_eof(ip, offset + size);
if (!isize) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return 0;
}
@@ -155,7 +154,7 @@ xfs_setfilesize(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
STATIC int
@@ -171,8 +170,13 @@ xfs_setfilesize_ioend(
* Similarly for freeze protection.
*/
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
- rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
+ __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
+
+ /* we abort the update if there was an IO error */
+ if (ioend->io_error) {
+ xfs_trans_cancel(tp);
+ return ioend->io_error;
+ }
return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
@@ -214,14 +218,17 @@ xfs_end_io(
ioend->io_error = -EIO;
goto done;
}
- if (ioend->io_error)
- goto done;
/*
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
+ * Detecting and handling completion IO errors is done individually
+ * for each case as different cleanup operations need to be performed
+ * on error.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ if (ioend->io_error)
+ goto done;
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
} else if (ioend->io_append_trans) {
@@ -351,13 +358,12 @@ xfs_imap_valid(
*/
STATIC void
xfs_end_bio(
- struct bio *bio,
- int error)
+ struct bio *bio)
{
xfs_ioend_t *ioend = bio->bi_private;
- ASSERT(atomic_read(&bio->bi_cnt) >= 1);
- ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
+ if (!ioend->io_error)
+ ioend->io_error = bio->bi_error;
/* Toss bio and pass work off to an xfsdatad thread */
bio->bi_private = NULL;
@@ -383,8 +389,7 @@ STATIC struct bio *
xfs_alloc_ioend_bio(
struct buffer_head *bh)
{
- int nvecs = bio_get_nr_vecs(bh->b_bdev);
- struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
+ struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
ASSERT(bio->bi_private == NULL);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
@@ -1254,13 +1259,28 @@ xfs_vm_releasepage(
* the DIO. There is only going to be one reference to the ioend and its life
* cycle is constrained by the DIO completion code. hence we don't need
* reference counting here.
+ *
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
+ *
+ * For DAX page faults, we are preparing to never see unwritten extents here,
+ * nor should we ever extend the inode size. Hence we will soon have nothing to
+ * do here for this case, ensuring we don't have to provide an IO completion
+ * callback to free an ioend that we don't actually need for a fault into the
+ * page at offset (2^63 - 1FSB) bytes.
*/
+
static void
xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
- xfs_off_t offset)
+ xfs_off_t offset,
+ bool dax_fault)
{
struct xfs_ioend *ioend;
xfs_off_t size = bh_result->b_size;
@@ -1273,6 +1293,13 @@ xfs_map_direct(
trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+ if (dax_fault) {
+ ASSERT(type == XFS_IO_OVERWRITE);
+ trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+ imap);
+ return;
+ }
+
if (bh_result->b_private) {
ioend = bh_result->b_private;
ASSERT(ioend->io_size > 0);
@@ -1287,7 +1314,8 @@ xfs_map_direct(
ioend->io_size, ioend->io_type,
imap);
} else if (type == XFS_IO_UNWRITTEN ||
- offset + size > i_size_read(inode)) {
+ offset + size > i_size_read(inode) ||
+ offset + size < 0) {
ioend = xfs_alloc_ioend(inode, type);
ioend->io_offset = offset;
ioend->io_size = size;
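Editor's aside on the `offset + size < 0` test added above: the comment earlier in this file explains that an IO ending at the highest supported file offset makes the signed 64-bit sum wrap negative, and the code then treats it as a possibly size-extending write. A standalone userspace sketch of the same arithmetic (not from the patch; the 4k block size is only an assumption for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* an IO whose last byte sits at the highest supported file offset */
        int64_t offset = INT64_MAX - 4096 + 1;   /* assume 4k filesystem blocks */
        int64_t size = 4096;

        /*
         * The kernel simply tests "offset + size < 0" (it is built with
         * -fno-strict-overflow); the unsigned detour here just keeps the
         * userspace example free of signed-overflow undefined behaviour.
         */
        int64_t end = (int64_t)((uint64_t)offset + (uint64_t)size);

        if (end < 0)
            printf("offset + size wrapped negative: assume a size-extending IO\n");
        return 0;
    }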
@@ -1349,7 +1377,8 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- int direct)
+ bool direct,
+ bool dax_fault)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1397,23 +1426,26 @@ __xfs_get_blocks(
if (error)
goto out_unlock;
+ /* for DAX, we convert unwritten extents directly */
if (create &&
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_startblock == DELAYSTARTBLOCK))) {
+ imap.br_startblock == DELAYSTARTBLOCK) ||
+ (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
if (direct || xfs_get_extsz_hint(ip)) {
/*
- * Drop the ilock in preparation for starting the block
- * allocation transaction. It will be retaken
- * exclusively inside xfs_iomap_write_direct for the
- * actual allocation.
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
*/
- xfs_iunlock(ip, lockmode);
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
if (error)
return error;
new = 1;
+
} else {
/*
* Delalloc reservations do not require a transaction,
@@ -1444,6 +1476,12 @@ __xfs_get_blocks(
goto out_unlock;
}
+ if (IS_DAX(inode) && create) {
+ ASSERT(!ISUNWRITTEN(&imap));
+ /* zeroing is not needed at a higher layer */
+ new = 0;
+ }
+
/* trim mapping down to size requested */
if (direct || size > (1 << inode->i_blkbits))
xfs_map_trim_size(inode, iblock, bh_result,
@@ -1461,7 +1499,8 @@ __xfs_get_blocks(
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
if (create && direct)
- xfs_map_direct(inode, bh_result, &imap, offset);
+ xfs_map_direct(inode, bh_result, &imap, offset,
+ dax_fault);
}
/*
@@ -1508,49 +1547,39 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}
-STATIC int
+int
xfs_get_blocks_direct(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
}
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
+int
+xfs_get_blocks_dax_fault(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ioend *ioend = private;
-
- trace_xfs_gbmap_direct_endio(ip, offset, size,
- ioend ? ioend->io_type : 0, NULL);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
+}
- if (!ioend) {
- ASSERT(offset + size <= i_size_read(inode));
- return;
- }
+static void
+__xfs_end_io_direct_write(
+ struct inode *inode,
+ struct xfs_ioend *ioend,
+ loff_t offset,
+ ssize_t size)
+{
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
goto out_end_io;
/*
@@ -1587,10 +1616,10 @@ xfs_end_io_direct_write(
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
- spin_lock(&ip->i_flags_lock);
+ spin_lock(&XFS_I(inode)->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
- spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&XFS_I(inode)->i_flags_lock);
/*
* If we are doing an append IO that needs to update the EOF on disk,
@@ -1607,6 +1636,59 @@ out_end_io:
return;
}
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+ struct kiocb *iocb,
+ loff_t offset,
+ ssize_t size,
+ void *private)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_ioend *ioend = private;
+
+ trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+ ioend ? ioend->io_type : 0, NULL);
+
+ if (!ioend) {
+ ASSERT(offset + size <= i_size_read(inode));
+ return;
+ }
+
+ __xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+static inline ssize_t
+xfs_vm_do_dio(
+ struct inode *inode,
+ struct kiocb *iocb,
+ struct iov_iter *iter,
+ loff_t offset,
+ void (*endio)(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t size,
+ void *private),
+ int flags)
+{
+ struct block_device *bdev;
+
+ if (IS_DAX(inode))
+ return dax_do_io(iocb, inode, iter, offset,
+ xfs_get_blocks_direct, endio, 0);
+
+ bdev = xfs_find_bdev_for_inode(inode);
+ return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+ xfs_get_blocks_direct, endio, NULL, flags);
+}
+
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
@@ -1614,16 +1696,11 @@ xfs_vm_direct_IO(
loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
- struct block_device *bdev = xfs_find_bdev_for_inode(inode);
- if (iov_iter_rw(iter) == WRITE) {
- return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
- xfs_get_blocks_direct,
- xfs_end_io_direct_write, NULL,
- DIO_ASYNC_EXTEND);
- }
- return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
- xfs_get_blocks_direct, NULL, NULL, 0);
+ if (iov_iter_rw(iter) == WRITE)
+ return xfs_vm_do_dio(inode, iocb, iter, offset,
+ xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+ return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
}
/*
@@ -1874,6 +1951,7 @@ xfs_vm_set_page_dirty(
loff_t end_offset;
loff_t offset;
int newly_dirty;
+ struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
@@ -1893,6 +1971,11 @@ xfs_vm_set_page_dirty(
offset += 1 << inode->i_blkbits;
} while (bh != head);
}
+ /*
+ * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+ * per-memcg dirty page counters.
+ */
+ memcg = mem_cgroup_begin_page_stat(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
@@ -1903,13 +1986,15 @@ xfs_vm_set_page_dirty(
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
+ mem_cgroup_end_page_stat(memcg);
+ if (newly_dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
diff --git a/kernel/fs/xfs/xfs_aops.h b/kernel/fs/xfs/xfs_aops.h
index ac644e013..f6ffc9ae5 100644
--- a/kernel/fs/xfs/xfs_aops.h
+++ b/kernel/fs/xfs/xfs_aops.h
@@ -53,7 +53,13 @@ typedef struct xfs_ioend {
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int xfs_get_blocks(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
+int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
+int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/kernel/fs/xfs/xfs_attr_inactive.c b/kernel/fs/xfs/xfs_attr_inactive.c
index 73e75a87a..2bb959ada 100644
--- a/kernel/fs/xfs/xfs_attr_inactive.c
+++ b/kernel/fs/xfs/xfs_attr_inactive.c
@@ -394,7 +394,6 @@ xfs_attr_inactive(
{
struct xfs_trans *trans;
struct xfs_mount *mp;
- int cancel_flags = 0;
int lock_mode = XFS_ILOCK_SHARED;
int error = 0;
@@ -423,7 +422,6 @@ xfs_attr_inactive(
goto out_cancel;
lock_mode = XFS_ILOCK_EXCL;
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT;
xfs_ilock(dp, lock_mode);
if (!XFS_IFORK_Q(dp))
@@ -455,12 +453,12 @@ xfs_attr_inactive(
/* Reset the attribute fork - this also destroys the in-core fork */
xfs_attr_fork_remove(dp, trans);
- error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(trans);
xfs_iunlock(dp, lock_mode);
return error;
out_cancel:
- xfs_trans_cancel(trans, cancel_flags);
+ xfs_trans_cancel(trans);
out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
if (dp->i_afp)
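A recurring pattern in this update, first fully visible in the xfs_attr_inactive.c hunks above, is that xfs_trans_commit() and xfs_trans_cancel() lose their flags argument, and locals such as cancel_flags disappear with it; judging from the hunks, the release/abort behaviour now comes from the transaction's own state rather than from the caller. Condensed from the diff lines above, purely as an illustration of the new call shape:

    /* before: the caller had to pass the right release/abort flags */
    xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
    error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);

    /* after: the transaction carries that information itself */
    xfs_trans_cancel(trans);
    error = xfs_trans_commit(trans);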
diff --git a/kernel/fs/xfs/xfs_attr_list.c b/kernel/fs/xfs/xfs_attr_list.c
index 65fb37a18..0ef7c2ed3 100644
--- a/kernel/fs/xfs/xfs_attr_list.c
+++ b/kernel/fs/xfs/xfs_attr_list.c
@@ -511,7 +511,7 @@ xfs_attr_list_int(
xfs_inode_t *dp = context->dp;
uint lock_mode;
- XFS_STATS_INC(xs_attr_list);
+ XFS_STATS_INC(dp->i_mount, xs_attr_list);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
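Another mechanical change repeated throughout the patch: XFS_STATS_INC()/XFS_STATS_ADD()/XFS_STATS_DEC() now take a mount pointer as their first argument, which suggests the counters are being moved from global to per-filesystem accounting. A hypothetical userspace analogue of switching a global counter to a per-instance one (illustrative only, not the kernel's xfs_stats implementation):

    #include <stdio.h>

    struct mount_stats { unsigned long attr_list; };
    struct mount { struct mount_stats stats; };

    /* hypothetical per-instance counter bump, analogous to XFS_STATS_INC(mp, ...) */
    #define STATS_INC(mp, field)   ((mp)->stats.field++)

    int main(void)
    {
        struct mount m = { { 0 } };

        STATS_INC(&m, attr_list);
        printf("attr_list lookups on this mount: %lu\n", m.stats.attr_list);
        return 0;
    }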
diff --git a/kernel/fs/xfs/xfs_bmap_util.c b/kernel/fs/xfs/xfs_bmap_util.c
index a52bbd3ab..dbae6490a 100644
--- a/kernel/fs/xfs/xfs_bmap_util.c
+++ b/kernel/fs/xfs/xfs_bmap_util.c
@@ -57,6 +57,35 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
}
/*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+ struct xfs_inode *ip,
+ xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
+ sector_t block = XFS_BB_TO_FSBT(mp, sector);
+ ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
+
+ if (IS_DAX(VFS_I(ip)))
+ return dax_clear_blocks(VFS_I(ip), block, size);
+
+ /*
+ * let the block layer decide on the fastest method of
+ * implementing the zeroing.
+ */
+ return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+
+}
+
+/*
* Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
* caller. Frees all the extents that need freeing, which must be done
* last due to locking considerations. We never free any extents in
@@ -67,78 +96,68 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
*/
int /* error */
xfs_bmap_finish(
- xfs_trans_t **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *committed) /* xact committed or not */
+ struct xfs_trans **tp, /* transaction pointer addr */
+ struct xfs_bmap_free *flist, /* i/o: list extents to free */
+ int *committed)/* xact committed or not */
{
- xfs_efd_log_item_t *efd; /* extent free data */
- xfs_efi_log_item_t *efi; /* extent free intention */
- int error; /* error return value */
- xfs_bmap_free_item_t *free; /* free extent item */
- struct xfs_trans_res tres; /* new log reservation */
- xfs_mount_t *mp; /* filesystem mount structure */
- xfs_bmap_free_item_t *next; /* next item on free list */
- xfs_trans_t *ntp; /* new transaction pointer */
+ struct xfs_efd_log_item *efd; /* extent free data */
+ struct xfs_efi_log_item *efi; /* extent free intention */
+ int error; /* error return value */
+ struct xfs_bmap_free_item *free; /* free extent item */
+ struct xfs_bmap_free_item *next; /* next item on free list */
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
if (flist->xbf_count == 0) {
*committed = 0;
return 0;
}
- ntp = *tp;
- efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+ efi = xfs_trans_get_efi(*tp, flist->xbf_count);
for (free = flist->xbf_first; free; free = free->xbfi_next)
- xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+ xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
free->xbfi_blockcount);
- tres.tr_logres = ntp->t_log_res;
- tres.tr_logcount = ntp->t_log_count;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- ntp = xfs_trans_dup(*tp);
- error = xfs_trans_commit(*tp, 0);
- *tp = ntp;
- *committed = 1;
- /*
- * We have a new transaction, so we should return committed=1,
- * even though we're returning an error.
- */
- if (error)
+ error = __xfs_trans_roll(tp, NULL, committed);
+ if (error) {
+ /*
+ * If the transaction was committed, drop the EFD reference
+ * since we're bailing out of here. The other reference is
+ * dropped when the EFI hits the AIL.
+ *
+ * If the transaction was not committed, the EFI is freed by the
+ * EFI item unlock handler on abort. Also, we have a new
+ * transaction so we should return committed=1 even though we're
+ * returning an error.
+ */
+ if (*committed) {
+ xfs_efi_release(efi);
+ xfs_force_shutdown((*tp)->t_mountp,
+ (error == -EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE :
+ SHUTDOWN_META_IO_ERROR);
+ } else {
+ *committed = 1;
+ }
+
return error;
+ }
/*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
+ * Get an EFD and free each extent in the list, logging to the EFD in
+ * the process. The remaining bmap free list is cleaned up by the caller
+ * on error.
*/
- xfs_log_ticket_put(ntp->t_ticket);
-
- error = xfs_trans_reserve(ntp, &tres, 0, 0);
- if (error)
- return error;
- efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
for (free = flist->xbf_first; free != NULL; free = next) {
next = free->xbfi_next;
- if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
- free->xbfi_blockcount))) {
- /*
- * The bmap free list will be cleaned up at a
- * higher level. The EFI will be canceled when
- * this transaction is aborted.
- * Need to force shutdown here to make sure it
- * happens, since this transaction may not be
- * dirty yet.
- */
- mp = ntp->t_mountp;
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_force_shutdown(mp,
- (error == -EFSCORRUPTED) ?
- SHUTDOWN_CORRUPT_INCORE :
- SHUTDOWN_META_IO_ERROR);
+
+ error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ if (error)
return error;
- }
- xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
- free->xbfi_blockcount);
+
xfs_bmap_del_free(flist, NULL, free);
}
+
return 0;
}
@@ -239,6 +258,13 @@ xfs_bmap_rtalloc(
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+
+ /* Zero the extent if we were asked to do so */
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
+ if (error)
+ return error;
+ }
} else {
ap->length = 0;
}
@@ -878,7 +904,7 @@ xfs_free_eofblocks(
if (need_iolock) {
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return -EAGAIN;
}
}
@@ -886,7 +912,7 @@ xfs_free_eofblocks(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
if (need_iolock)
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return error;
@@ -908,12 +934,9 @@ xfs_free_eofblocks(
* If we get an error at this point we simply don't
* bother truncating the file.
*/
- xfs_trans_cancel(tp,
- (XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT));
+ xfs_trans_cancel(tp);
} else {
- error = xfs_trans_commit(tp,
- XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (!error)
xfs_inode_clear_eofblocks_tag(ip);
}
@@ -1026,7 +1049,7 @@ xfs_alloc_file_space(
* Free the transaction structure.
*/
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1040,7 +1063,7 @@ xfs_alloc_file_space(
xfs_bmap_init(&free_list, &firstfsb);
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, alloc_type, &firstfsb,
- 0, imapp, &nimaps, &free_list);
+ resblks, imapp, &nimaps, &free_list);
if (error) {
goto error0;
}
@@ -1053,7 +1076,7 @@ xfs_alloc_file_space(
goto error0;
}
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error) {
break;
@@ -1077,7 +1100,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
error1: /* Just cancel transaction */
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1133,14 +1156,29 @@ xfs_zero_remaining_bytes(
break;
ASSERT(imap.br_blockcount >= 1);
ASSERT(imap.br_startoff == offset_fsb);
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+ if (imap.br_startblock == HOLESTARTBLOCK ||
+ imap.br_state == XFS_EXT_UNWRITTEN) {
+ /* skip the entire extent */
+ lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+ imap.br_blockcount) - 1;
+ continue;
+ }
+
lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
if (lastoffset > endoff)
lastoffset = endoff;
- if (imap.br_startblock == HOLESTARTBLOCK)
- continue;
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+ /* DAX can just zero the backing device directly */
+ if (IS_DAX(VFS_I(ip))) {
+ error = dax_zero_page_range(VFS_I(ip), offset,
+ lastoffset - offset + 1,
+ xfs_get_blocks_direct);
+ if (error)
+ return error;
continue;
+ }
error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -1289,7 +1327,7 @@ xfs_free_file_space(
* Free the transaction structure.
*/
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1320,7 +1358,7 @@ xfs_free_file_space(
goto error0;
}
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
@@ -1330,7 +1368,7 @@ xfs_free_file_space(
error0:
xfs_bmap_cancel(&free_list);
error1:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto out;
}
@@ -1462,7 +1500,7 @@ xfs_shift_file_space(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
@@ -1472,7 +1510,7 @@ xfs_shift_file_space(
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
XFS_QMOPT_RES_REGBLKS);
if (error)
- goto out;
+ goto out_trans_cancel;
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -1486,19 +1524,21 @@ xfs_shift_file_space(
&done, stop_fsb, &first_block, &free_list,
direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
if (error)
- goto out;
+ goto out_bmap_cancel;
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
- goto out;
+ goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
}
return error;
-out:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
@@ -1718,7 +1758,7 @@ xfs_swap_extents(
tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
@@ -1901,7 +1941,7 @@ xfs_swap_extents(
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
@@ -1915,6 +1955,6 @@ out_unlock:
goto out;
out_trans_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out;
}
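One detail worth calling out from the xfs_shift_file_space() hunk above: the single `out:` label becomes `out_bmap_cancel:` and `out_trans_cancel:`, so the bmap free list is cancelled before the transaction when the extent shift fails part-way. The layered-label cleanup it switches to is the usual kernel idiom; a standalone userspace sketch of the same idea (hypothetical resources, not XFS code):

    #include <stdio.h>
    #include <stdlib.h>

    /* hypothetical two-stage setup with layered error labels */
    static int do_work(int fail_shift)
    {
        char *trans = malloc(32);        /* stands in for the transaction */
        char *free_list;

        if (!trans)
            return -1;

        free_list = malloc(32);          /* stands in for the bmap free list */
        if (!free_list)
            goto out_trans_cancel;

        if (fail_shift)                  /* the extent shift failed part-way */
            goto out_bmap_cancel;

        free(free_list);
        free(trans);
        return 0;

    out_bmap_cancel:
        free(free_list);                 /* undo the later setup first */
    out_trans_cancel:
        free(trans);                     /* then the earlier one */
        return -1;
    }

    int main(void)
    {
        printf("clean run:  %d\n", do_work(0));
        printf("failed run: %d\n", do_work(1));
        return 0;
    }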
diff --git a/kernel/fs/xfs/xfs_buf.c b/kernel/fs/xfs/xfs_buf.c
index 1790b00be..39090fc56 100644
--- a/kernel/fs/xfs/xfs_buf.c
+++ b/kernel/fs/xfs/xfs_buf.c
@@ -201,7 +201,7 @@ _xfs_buf_alloc(
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
- XFS_STATS_INC(xb_create);
+ XFS_STATS_INC(target->bt_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
return bp;
@@ -354,15 +354,16 @@ retry:
*/
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
__func__, gfp_mask);
- XFS_STATS_INC(xb_page_retries);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- XFS_STATS_INC(xb_page_found);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
@@ -438,7 +439,6 @@ _xfs_buf_find(
xfs_buf_flags_t flags,
xfs_buf_t *new_bp)
{
- size_t numbytes;
struct xfs_perag *pag;
struct rb_node **rbp;
struct rb_node *parent;
@@ -450,10 +450,9 @@ _xfs_buf_find(
for (i = 0; i < nmaps; i++)
numblks += map[i].bm_len;
- numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
/*
@@ -518,7 +517,7 @@ _xfs_buf_find(
new_bp->b_pag = pag;
spin_unlock(&pag->pag_buf_lock);
} else {
- XFS_STATS_INC(xb_miss_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
}
@@ -531,11 +530,11 @@ found:
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
- XFS_STATS_INC(xb_busy_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
return NULL;
}
xfs_buf_lock(bp);
- XFS_STATS_INC(xb_get_locked_waited);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
}
/*
@@ -551,7 +550,7 @@ found:
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(xb_get_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
return bp;
}
@@ -605,7 +604,14 @@ found:
}
}
- XFS_STATS_INC(xb_get);
+ /*
+ * Clear b_error if this is a lookup from a caller that doesn't expect
+ * valid data to be found in the buffer.
+ */
+ if (!(flags & XBF_READ))
+ xfs_buf_ioerror(bp, 0);
+
+ XFS_STATS_INC(target->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
}
@@ -645,7 +651,7 @@ xfs_buf_read_map(
trace_xfs_buf_read(bp, flags, _RET_IP_);
if (!XFS_BUF_ISDONE(bp)) {
- XFS_STATS_INC(xb_get_read);
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
_xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
@@ -1096,8 +1102,7 @@ xfs_bwrite(
STATIC void
xfs_buf_bio_end_io(
- struct bio *bio,
- int error)
+ struct bio *bio)
{
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
@@ -1105,10 +1110,10 @@ xfs_buf_bio_end_io(
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
*/
- if (error) {
+ if (bio->bi_error) {
spin_lock(&bp->b_lock);
if (!bp->b_io_error)
- bp->b_io_error = error;
+ bp->b_io_error = bio->bi_error;
spin_unlock(&bp->b_lock);
}
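Both xfs_end_bio() earlier in the diff and xfs_buf_bio_end_io() above adapt to the new bio completion prototype (no separate error argument; the status now lives in bio->bi_error), and both take care to record only the first error seen, since a buffer can be built from several bios. A standalone userspace sketch of that "first error wins" guard (plain C; the kernel additionally holds a spinlock around the check, which is omitted here):

    #include <stdio.h>

    struct buf { int io_error; };

    /* record an error from one completion, keeping only the earliest one */
    static void complete_one(struct buf *bp, int err)
    {
        if (err && !bp->io_error)
            bp->io_error = err;
    }

    int main(void)
    {
        struct buf bp = { 0 };

        complete_one(&bp, 0);    /* first fragment completes OK */
        complete_one(&bp, -5);   /* second fragment fails (-EIO) */
        complete_one(&bp, -12);  /* a later failure must not overwrite it */

        printf("final io_error = %d\n", bp.io_error);   /* prints -5 */
        return 0;
    }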
@@ -1419,9 +1424,9 @@ xfs_buf_submit_wait(
return error;
}
-xfs_caddr_t
+void *
xfs_buf_offset(
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
size_t offset)
{
struct page *page;
@@ -1431,7 +1436,7 @@ xfs_buf_offset(
offset += bp->b_offset;
page = bp->b_pages[offset >> PAGE_SHIFT];
- return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+ return page_address(page) + (offset & (PAGE_SIZE-1));
}
/*
@@ -1522,6 +1527,16 @@ xfs_wait_buftarg(
LIST_HEAD(dispose);
int loop = 0;
+ /*
+ * We need to flush the buffer workqueue to ensure that all IO
+ * completion processing is 100% done. Just waiting on buffer locks is
+ * not sufficient for async IO as the reference count held over IO is
+ * not released until after the buffer lock is dropped. Hence we need to
+ * ensure here that all reference counts have been dropped before we
+ * start walking the LRU list.
+ */
+ drain_workqueue(btp->bt_mount->m_buf_workqueue);
+
/* loop until there is nothing left on the lru list. */
while (list_lru_count(&btp->bt_lru)) {
list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
@@ -1533,9 +1548,10 @@ xfs_wait_buftarg(
list_del_init(&bp->b_lru);
if (bp->b_flags & XBF_WRITE_FAIL) {
xfs_alert(btp->bt_mount,
-"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
-"Please run xfs_repair to determine the extent of the problem.",
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!",
(long long)bp->b_bn);
+ xfs_alert(btp->bt_mount,
+"Please run xfs_repair to determine the extent of the problem.");
}
xfs_buf_rele(bp);
}
diff --git a/kernel/fs/xfs/xfs_buf.h b/kernel/fs/xfs/xfs_buf.h
index 75ff5d5a7..c79b717d9 100644
--- a/kernel/fs/xfs/xfs_buf.h
+++ b/kernel/fs/xfs/xfs_buf.h
@@ -23,6 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
@@ -299,7 +300,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
/* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+extern void *xfs_buf_offset(struct xfs_buf *, size_t);
/* Delayed Write Buffer Routines */
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
diff --git a/kernel/fs/xfs/xfs_buf_item.c b/kernel/fs/xfs/xfs_buf_item.c
index 092d652bc..7e986da34 100644
--- a/kernel/fs/xfs/xfs_buf_item.c
+++ b/kernel/fs/xfs/xfs_buf_item.c
@@ -647,11 +647,7 @@ xfs_buf_item_unlock(
xfs_buf_item_relse(bp);
else if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
- if (lip->li_flags & XFS_LI_IN_AIL) {
- spin_lock(&lip->li_ailp->xa_lock);
- xfs_trans_ail_delete(lip->li_ailp, lip,
- SHUTDOWN_LOG_IO_ERROR);
- }
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
}
}
@@ -750,13 +746,13 @@ xfs_buf_item_free_format(
* buffer (see xfs_buf_attach_iodone() below), then put the
* buf log item at the front.
*/
-void
+int
xfs_buf_item_init(
- xfs_buf_t *bp,
- xfs_mount_t *mp)
+ struct xfs_buf *bp,
+ struct xfs_mount *mp)
{
- xfs_log_item_t *lip = bp->b_fspriv;
- xfs_buf_log_item_t *bip;
+ struct xfs_log_item *lip = bp->b_fspriv;
+ struct xfs_buf_log_item *bip;
int chunks;
int map_size;
int error;
@@ -770,12 +766,11 @@ xfs_buf_item_init(
*/
ASSERT(bp->b_target->bt_mount == mp);
if (lip != NULL && lip->li_type == XFS_LI_BUF)
- return;
+ return 0;
bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
- xfs_buf_hold(bp);
/*
* chunks is the number of XFS_BLF_CHUNK size pieces the buffer
@@ -788,6 +783,11 @@ xfs_buf_item_init(
*/
error = xfs_buf_item_get_format(bip, bp->b_map_count);
ASSERT(error == 0);
+ if (error) { /* to stop gcc throwing set-but-unused warnings */
+ kmem_zone_free(xfs_buf_item_zone, bip);
+ return error;
+ }
+
for (i = 0; i < bip->bli_format_count; i++) {
chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
@@ -807,6 +807,8 @@ xfs_buf_item_init(
if (bp->b_fspriv)
bip->bli_item.li_bio_list = bp->b_fspriv;
bp->b_fspriv = bip;
+ xfs_buf_hold(bp);
+ return 0;
}
diff --git a/kernel/fs/xfs/xfs_buf_item.h b/kernel/fs/xfs/xfs_buf_item.h
index 3f3455a41..f7eba99d1 100644
--- a/kernel/fs/xfs/xfs_buf_item.h
+++ b/kernel/fs/xfs/xfs_buf_item.h
@@ -61,7 +61,7 @@ typedef struct xfs_buf_log_item {
struct xfs_buf_log_format __bli_format; /* embedded in-log header */
} xfs_buf_log_item_t;
-void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
diff --git a/kernel/fs/xfs/xfs_dir2_readdir.c b/kernel/fs/xfs/xfs_dir2_readdir.c
index 098cd78fe..642d55d10 100644
--- a/kernel/fs/xfs/xfs_dir2_readdir.c
+++ b/kernel/fs/xfs/xfs_dir2_readdir.c
@@ -171,6 +171,7 @@ xfs_dir2_block_getdents(
int wantoff; /* starting block offset */
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
+ int lock_mode;
/*
* If the block number in the offset is out of range, we're done.
@@ -178,7 +179,9 @@ xfs_dir2_block_getdents(
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
+ lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir3_block_read(NULL, dp, &bp);
+ xfs_iunlock(dp, lock_mode);
if (error)
return error;
@@ -529,9 +532,12 @@ xfs_dir2_leaf_getdents(
* current buffer, need to get another one.
*/
if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+ int lock_mode;
+ lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
&curoff, &bp);
+ xfs_iunlock(dp, lock_mode);
if (error || !map_info->map_valid)
break;
@@ -653,7 +659,6 @@ xfs_readdir(
struct xfs_da_args args = { NULL };
int rval;
int v;
- uint lock_mode;
trace_xfs_readdir(dp);
@@ -661,12 +666,12 @@ xfs_readdir(
return -EIO;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_getdents);
+ XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
- lock_mode = xfs_ilock_data_map_shared(dp);
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_getdents(&args, ctx);
else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -675,7 +680,7 @@ xfs_readdir(
rval = xfs_dir2_block_getdents(&args, ctx);
else
rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
- xfs_iunlock(dp, lock_mode);
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return rval;
}
diff --git a/kernel/fs/xfs/xfs_dquot.c b/kernel/fs/xfs/xfs_dquot.c
index 02c01bbbc..7ac6c5c58 100644
--- a/kernel/fs/xfs/xfs_dquot.c
+++ b/kernel/fs/xfs/xfs_dquot.c
@@ -75,9 +75,9 @@ xfs_qm_dqdestroy(
ASSERT(list_empty(&dqp->q_lru));
mutex_destroy(&dqp->q_qlock);
- kmem_zone_free(xfs_qm_dqzone, dqp);
- XFS_STATS_DEC(xs_qm_dquot);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
+ kmem_zone_free(xfs_qm_dqzone, dqp);
}
/*
@@ -251,7 +251,7 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_flags = type;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
@@ -568,8 +568,6 @@ xfs_qm_dqread(
struct xfs_buf *bp;
struct xfs_trans *tp = NULL;
int error;
- int cancelflags = 0;
-
dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
@@ -607,7 +605,7 @@ xfs_qm_dqread(
break;
}
- XFS_STATS_INC(xs_qm_dquot);
+ XFS_STATS_INC(mp, xs_qm_dquot);
trace_xfs_dqread(dqp);
@@ -617,7 +615,6 @@ xfs_qm_dqread(
XFS_QM_DQALLOC_SPACE_RES(mp), 0);
if (error)
goto error1;
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
}
/*
@@ -632,7 +629,6 @@ xfs_qm_dqread(
* allocate (ENOENT).
*/
trace_xfs_dqread_fail(dqp);
- cancelflags |= XFS_TRANS_ABORT;
goto error1;
}
@@ -670,7 +666,7 @@ xfs_qm_dqread(
xfs_trans_brelse(tp, bp);
if (tp) {
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error0;
}
@@ -680,7 +676,7 @@ xfs_qm_dqread(
error1:
if (tp)
- xfs_trans_cancel(tp, cancelflags);
+ xfs_trans_cancel(tp);
error0:
xfs_qm_dqdestroy(dqp);
*O_dqpp = NULL;
@@ -751,12 +747,12 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_hit(dqp);
- XFS_STATS_INC(xs_qm_dqcachehits);
+ XFS_STATS_INC(mp, xs_qm_dqcachehits);
*O_dqpp = dqp;
return 0;
}
mutex_unlock(&qi->qi_tree_lock);
- XFS_STATS_INC(xs_qm_dqcachemisses);
+ XFS_STATS_INC(mp, xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -810,7 +806,7 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_dup(dqp);
xfs_qm_dqdestroy(dqp);
- XFS_STATS_INC(xs_qm_dquot_dups);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
goto restart;
}
@@ -850,7 +846,7 @@ xfs_qm_dqput(
trace_xfs_dqput_free(dqp);
if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
- XFS_STATS_INC(xs_qm_dquot_unused);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
xfs_dqunlock(dqp);
}
@@ -958,12 +954,8 @@ xfs_qm_dqflush(
struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
dqp->dq_flags &= ~XFS_DQ_DIRTY;
- spin_lock(&mp->m_ail->xa_lock);
- if (lip->li_flags & XFS_LI_IN_AIL)
- xfs_trans_ail_delete(mp->m_ail, lip,
- SHUTDOWN_CORRUPT_INCORE);
- else
- spin_unlock(&mp->m_ail->xa_lock);
+ xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE);
+
error = -EIO;
goto out_unlock;
}
diff --git a/kernel/fs/xfs/xfs_error.c b/kernel/fs/xfs/xfs_error.c
index 338e50bbf..74d0e5966 100644
--- a/kernel/fs/xfs/xfs_error.c
+++ b/kernel/fs/xfs/xfs_error.c
@@ -127,7 +127,7 @@ xfs_error_report(
struct xfs_mount *mp,
const char *filename,
int linenum,
- inst_t *ra)
+ void *ra)
{
if (level <= xfs_error_level) {
xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
@@ -146,7 +146,7 @@ xfs_corruption_error(
void *p,
const char *filename,
int linenum,
- inst_t *ra)
+ void *ra)
{
if (level <= xfs_error_level)
xfs_hex_dump(p, 64);
diff --git a/kernel/fs/xfs/xfs_error.h b/kernel/fs/xfs/xfs_error.h
index c0394ed12..4ed3042a0 100644
--- a/kernel/fs/xfs/xfs_error.h
+++ b/kernel/fs/xfs/xfs_error.h
@@ -21,10 +21,10 @@
struct xfs_mount;
extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
- const char *filename, int linenum, inst_t *ra);
+ const char *filename, int linenum, void *ra);
extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, void *p, const char *filename,
- int linenum, inst_t *ra);
+ int linenum, void *ra);
extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERROR_REPORT(e, lvl, mp) \
diff --git a/kernel/fs/xfs/xfs_extfree_item.c b/kernel/fs/xfs/xfs_extfree_item.c
index cb7fe64cd..4aa015321 100644
--- a/kernel/fs/xfs/xfs_extfree_item.c
+++ b/kernel/fs/xfs/xfs_extfree_item.c
@@ -47,28 +47,6 @@ xfs_efi_item_free(
}
/*
- * Freeing the efi requires that we remove it from the AIL if it has already
- * been placed there. However, the EFI may not yet have been placed in the AIL
- * when called by xfs_efi_release() from EFD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the reference
- * count to ensure only the last caller frees the EFI.
- */
-STATIC void
-__xfs_efi_release(
- struct xfs_efi_log_item *efip)
-{
- struct xfs_ail *ailp = efip->efi_item.li_ailp;
-
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- spin_lock(&ailp->xa_lock);
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, &efip->efi_item,
- SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
-}
-
-/*
* This returns the number of iovecs needed to log the given efi item.
* We only need 1 iovec for an efi item. It just logs the efi_log_format
* structure.
@@ -128,12 +106,12 @@ xfs_efi_item_pin(
}
/*
- * While EFIs cannot really be pinned, the unpin operation is the last place at
- * which the EFI is manipulated during a transaction. If we are being asked to
- * remove the EFI it's because the transaction has been cancelled and by
- * definition that means the EFI cannot be in the AIL so remove it from the
- * transaction and free it. Otherwise coordinate with xfs_efi_release()
- * to determine who gets to free the EFI.
+ * The unpin operation is the last place an EFI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the EFI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the EFI to either construct
+ * and commit the EFD or drop the EFD's reference in the event of error. Simply
+ * drop the log's EFI reference now that the log is done with it.
*/
STATIC void
xfs_efi_item_unpin(
@@ -141,15 +119,7 @@ xfs_efi_item_unpin(
int remove)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-
- if (remove) {
- ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
- if (lip->li_desc)
- xfs_trans_del_item(lip);
- xfs_efi_item_free(efip);
- return;
- }
- __xfs_efi_release(efip);
+ xfs_efi_release(efip);
}
/*
@@ -167,6 +137,11 @@ xfs_efi_item_push(
return XFS_ITEM_PINNED;
}
+/*
+ * The EFI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an EFD isn't going to be
+ * constructed and thus we free the EFI here directly.
+ */
STATIC void
xfs_efi_item_unlock(
struct xfs_log_item *lip)
@@ -239,7 +214,7 @@ xfs_efi_init(
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents;
- efip->efi_format.efi_id = (__psint_t)(void*)efip;
+ efip->efi_format.efi_id = (uintptr_t)(void *)efip;
atomic_set(&efip->efi_next_extent, 0);
atomic_set(&efip->efi_refcount, 2);
@@ -301,23 +276,19 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
/*
- * This is called by the efd item code below to release references to the given
- * efi item. Each efd calls this with the number of extents that it has
- * logged, and when the sum of these reaches the total number of extents logged
- * by this efi item we can free the efi item.
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
*/
void
-xfs_efi_release(xfs_efi_log_item_t *efip,
- uint nextents)
+xfs_efi_release(
+ struct xfs_efi_log_item *efip)
{
- ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
- if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
- /* recovery needs us to drop the EFI reference, too */
- if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
- __xfs_efi_release(efip);
-
- __xfs_efi_release(efip);
- /* efip may now have been freed, do not reference it again. */
+ if (atomic_dec_and_test(&efip->efi_refcount)) {
+ xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_efi_item_free(efip);
}
}
@@ -415,20 +386,27 @@ xfs_efd_item_push(
return XFS_ITEM_PINNED;
}
+/*
+ * The EFD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the EFI and free the EFD.
+ */
STATIC void
xfs_efd_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
- xfs_efd_item_free(EFD_ITEM(lip));
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+ if (lip->li_flags & XFS_LI_ABORTED) {
+ xfs_efi_release(efdp->efd_efip);
+ xfs_efd_item_free(efdp);
+ }
}
/*
- * When the efd item is committed to disk, all we need to do
- * is delete our reference to our partner efi item and then
- * free ourselves. Since we're freeing ourselves we must
- * return -1 to keep the transaction code from further referencing
- * this item.
+ * When the efd item is committed to disk, all we need to do is delete our
+ * reference to our partner efi item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from further
+ * referencing this item.
*/
STATIC xfs_lsn_t
xfs_efd_item_committed(
@@ -438,13 +416,14 @@ xfs_efd_item_committed(
struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
/*
- * If we got a log I/O error, it's always the case that the LR with the
- * EFI got unpinned and freed before the EFD got aborted.
+ * Drop the EFI reference regardless of whether the EFD has been
+ * aborted. Once the EFD transaction is constructed, it is the sole
+ * responsibility of the EFD to release the EFI (even if the EFI is
+ * aborted due to log I/O error).
*/
- if (!(lip->li_flags & XFS_LI_ABORTED))
- xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
-
+ xfs_efi_release(efdp->efd_efip);
xfs_efd_item_free(efdp);
+
return (xfs_lsn_t)-1;
}
diff --git a/kernel/fs/xfs/xfs_extfree_item.h b/kernel/fs/xfs/xfs_extfree_item.h
index 0ffbce32d..8fa865170 100644
--- a/kernel/fs/xfs/xfs_extfree_item.h
+++ b/kernel/fs/xfs/xfs_extfree_item.h
@@ -39,9 +39,28 @@ struct kmem_zone;
* "extent free done" log item described below.
*
* The EFI is reference counted so that it is not freed prior to both the EFI
- * and EFD being committed and unpinned. This ensures that when the last
- * reference goes away the EFI will always be in the AIL as it has been
- * unpinned, regardless of whether the EFD is processed before or after the EFI.
+ * and EFD being committed and unpinned. This ensures the EFI is inserted into
+ * the AIL even in the event of out of order EFI/EFD processing. In other words,
+ * an EFI is born with two references:
+ *
+ * 1.) an EFI held reference to track EFI AIL insertion
+ * 2.) an EFD held reference to track EFD commit
+ *
+ * On allocation, both references are the responsibility of the caller. Once the
+ * EFI is added to and dirtied in a transaction, ownership of reference one
+ * transfers to the transaction. The reference is dropped once the EFI is
+ * inserted to the AIL or in the event of failure along the way (e.g., commit
+ * failure, log I/O error, etc.). Note that the caller remains responsible for
+ * the EFD reference under all circumstances to this point. The caller has no
+ * means to detect failure once the transaction is committed, however.
+ * Therefore, an EFD is required after this point, even in the event of
+ * unrelated failure.
+ *
+ * Once an EFD is allocated and dirtied in a transaction, reference two
+ * transfers to the transaction. The EFD reference is dropped once it reaches
+ * the unpin handler. Similar to the EFI, the reference also drops in the event
+ * of commit failure or log I/O errors. Note that the EFD is not inserted in the
+ * AIL, so at this point both the EFI and EFD are freed.
*/
typedef struct xfs_efi_log_item {
xfs_log_item_t efi_item;
@@ -77,5 +96,6 @@ xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
int xfs_efi_copy_format(xfs_log_iovec_t *buf,
xfs_efi_log_format_t *dst_efi_fmt);
void xfs_efi_item_free(xfs_efi_log_item_t *);
+void xfs_efi_release(struct xfs_efi_log_item *);
#endif /* __XFS_EXTFREE_ITEM_H__ */
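The long header comment above describes the reworked EFI lifetime: the item is born with a reference count of two (one tracking AIL insertion, one tracking the EFD), and the new xfs_efi_release() in the .c hunk frees the item on the final drop. A standalone userspace analogue of that "born with two references, last drop frees" pattern (illustrative only, using C11 atomics rather than the kernel's atomic_t):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct efi_like {
        atomic_int refcount;
    };

    static struct efi_like *efi_alloc(void)
    {
        struct efi_like *efi = malloc(sizeof(*efi));

        if (!efi)
            exit(1);
        atomic_init(&efi->refcount, 2);  /* one for AIL insertion, one for the EFD */
        return efi;
    }

    static void efi_release(struct efi_like *efi)
    {
        /* whoever drops the last reference frees the item */
        if (atomic_fetch_sub(&efi->refcount, 1) == 1) {
            printf("last reference dropped, freeing\n");
            free(efi);
        }
    }

    int main(void)
    {
        struct efi_like *efi = efi_alloc();

        efi_release(efi);   /* e.g. the AIL-insertion side is done with it */
        efi_release(efi);   /* e.g. the EFD commit/unpin side is done with it */
        return 0;
    }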
diff --git a/kernel/fs/xfs/xfs_file.c b/kernel/fs/xfs/xfs_file.c
index 3b7591224..f5392ab2d 100644
--- a/kernel/fs/xfs/xfs_file.c
+++ b/kernel/fs/xfs/xfs_file.c
@@ -41,6 +41,7 @@
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -79,14 +80,15 @@ xfs_rw_ilock_demote(
}
/*
- * xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
*
- * xfs_iozero clears the specified range of buffer supplied,
- * and marks all the affected blocks as valid and modified. If
- * an affected block is not allocated, it will be allocated. If
- * an affected block is not completely overwritten, and is not
- * valid before the operation, it will be read from disk before
- * being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
*/
int
xfs_iozero(
@@ -96,7 +98,8 @@ xfs_iozero(
{
struct page *page;
struct address_space *mapping;
- int status;
+ int status = 0;
+
mapping = VFS_I(ip)->i_mapping;
do {
@@ -108,20 +111,27 @@ xfs_iozero(
if (bytes > count)
bytes = count;
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
+ if (IS_DAX(VFS_I(ip))) {
+ status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+ xfs_get_blocks_direct);
+ if (status)
+ break;
+ } else {
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
- zero_user(page, offset, bytes);
+ zero_user(page, offset, bytes);
- status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
- page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
+ status = pagecache_write_end(NULL, mapping, pos, bytes,
+ bytes, page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ status = 0;
+ }
pos += bytes;
count -= bytes;
- status = 0;
} while (count);
return status;
@@ -138,7 +148,7 @@ xfs_update_prealloc_flags(
tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -160,7 +170,7 @@ xfs_update_prealloc_flags(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (flags & XFS_PREALLOC_SYNC)
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
/*
@@ -232,19 +242,30 @@ xfs_file_fsync(
}
/*
- * All metadata updates are logged, which means that we just have
- * to flush the log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to
+ * flush the log up to the latest LSN that touched the inode. If we have
+ * concurrent fsync/fdatasync() calls, we need them to all block on the
+ * log force before we clear the ili_fsync_fields field. This ensures
+ * that we don't get a racing sync operation that does not wait for the
+ * metadata to hit the journal before returning. If we race with
+ * clearing the ili_fsync_fields, then all that will happen is the log
+ * force will do nothing as the lsn will already be on disk. We can't
+ * race with setting ili_fsync_fields because that is done under
+ * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+ * until after the ili_fsync_fields is cleared.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
if (!datasync ||
- (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
}
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (lsn)
+ if (lsn) {
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+ ip->i_itemp->ili_fsync_fields = 0;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
/*
* If we only have a single device, and the log force above was
@@ -277,14 +298,14 @@ xfs_file_read_iter(
xfs_fsize_t n;
loff_t pos = iocb->ki_pos;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(mp, xs_read_calls);
if (unlikely(iocb->ki_flags & IOCB_DIRECT))
ioflags |= XFS_IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
- if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+ if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -307,24 +328,33 @@ xfs_file_read_iter(
return -EIO;
/*
- * Locking is a bit tricky here. If we take an exclusive lock
- * for direct IO, we effectively serialise all new concurrent
- * read IO to this file and block it behind IO that is currently in
- * progress because IO in progress holds the IO lock shared. We only
- * need to hold the lock exclusive to blow away the page cache, so
- * only take lock exclusively if the page cache needs invalidation.
- * This allows the normal direct IO case of no page cache pages to
- * proceeed concurrently without serialisation.
+ * Locking is a bit tricky here. If we take an exclusive lock for direct
+ * IO, we effectively serialise all new concurrent read IO to this file
+ * and block it behind IO that is currently in progress because IO in
+ * progress holds the IO lock shared. We only need to hold the lock
+ * exclusive to blow away the page cache, so only take lock exclusively
+ * if the page cache needs invalidation. This allows the normal direct
+ * IO case of no page cache pages to proceed concurrently without
+ * serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+ /*
+ * The generic dio code only flushes the range of the particular
+ * I/O. Because we take an exclusive lock here, this whole
+ * sequence is considerably more expensive for us. This has a
+ * noticeable performance impact for any file with cached pages,
+ * even when outside of the range of the particular I/O.
+ *
+ * Hence, amortize the cost of the lock against a full file
+ * flush and reduce the chances of repeated iolock cycles going
+ * forward.
+ */
if (inode->i_mapping->nrpages) {
- ret = filemap_write_and_wait_range(
- VFS_I(ip)->i_mapping,
- pos, pos + size - 1);
+ ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
@@ -335,9 +365,7 @@ xfs_file_read_iter(
* we fail to invalidate a page, but this should never
* happen on XFS. Warn if it does fail.
*/
- ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + size - 1) >> PAGE_CACHE_SHIFT);
+ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
@@ -348,7 +376,7 @@ xfs_file_read_iter(
ret = generic_file_read_iter(iocb, to);
if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ XFS_STATS_ADD(mp, xs_read_bytes, ret);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -366,7 +394,7 @@ xfs_file_splice_read(
int ioflags = 0;
ssize_t ret;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(ip->i_mount, xs_read_calls);
if (infilp->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
@@ -378,9 +406,13 @@ xfs_file_splice_read(
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+ /* for dax, we need to avoid the page cache */
+ if (IS_DAX(VFS_I(ip)))
+ ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+ else
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -461,6 +493,8 @@ xfs_zero_eof(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
+ trace_xfs_zero_eof(ip, isize, offset - isize);
+
/*
* First handle zeroing the block on which isize resides.
*
@@ -553,6 +587,7 @@ xfs_file_aio_write_checks(
struct xfs_inode *ip = XFS_I(inode);
ssize_t error = 0;
size_t count = iov_iter_count(from);
+ bool drained_dio = false;
restart:
error = generic_write_checks(iocb, from);
@@ -563,6 +598,13 @@ restart:
if (error)
return error;
+ /* For changing security info in file_remove_privs() we need i_mutex */
+ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ goto restart;
+ }
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
@@ -583,12 +625,13 @@ restart:
bool zero = false;
spin_unlock(&ip->i_flags_lock);
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
-
+ if (!drained_dio) {
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
/*
* We now have an IO submission barrier in place, but
* AIO can do EOF updates during IO completion and hence
@@ -598,6 +641,7 @@ restart:
* no-op.
*/
inode_dio_wait(inode);
+ drained_dio = true;
goto restart;
}
error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@ -623,7 +667,9 @@ restart:
* setgid bits if the process is not being run by root. This keeps
* people from modifying setuid and setgid binaries.
*/
- return file_remove_suid(file);
+ if (!IS_NOSEC(inode))
+ return file_remove_privs(file);
+ return 0;
}
/*
@@ -672,7 +718,7 @@ xfs_file_dio_aio_write(
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
- if ((pos | count) & target->bt_logical_sectormask)
+ if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
@@ -710,19 +756,19 @@ xfs_file_dio_aio_write(
pos = iocb->ki_pos;
end = pos + count - 1;
+ /*
+ * See xfs_file_read_iter() for why we do a full-file flush here.
+ */
if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, end);
+ ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (ret)
goto out;
/*
- * Invalidate whole pages. This can return an error if
- * we fail to invalidate a page, but this should never
- * happen on XFS. Warn if it does fail.
+ * Invalidate whole pages. This can return an error if we fail
+ * to invalidate a page, but this should never happen on XFS.
+ * Warn if it does fail.
*/
- ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- end >> PAGE_CACHE_SHIFT);
+ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
@@ -758,8 +804,11 @@ xfs_file_dio_aio_write(
out:
xfs_rw_iunlock(ip, iolock);
- /* No fallback to buffered IO on errors for XFS. */
- ASSERT(ret < 0 || ret == count);
+ /*
+ * No fallback to buffered IO on errors for XFS. DAX can result in
+ * partial writes, but direct IO will either complete fully or fail.
+ */
+ ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
return ret;
}
@@ -834,7 +883,7 @@ xfs_file_write_iter(
ssize_t ret;
size_t ocount = iov_iter_count(from);
- XFS_STATS_INC(xs_write_calls);
+ XFS_STATS_INC(ip->i_mount, xs_write_calls);
if (ocount == 0)
return 0;
@@ -842,7 +891,7 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+ if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
@@ -850,7 +899,7 @@ xfs_file_write_iter(
if (ret > 0) {
ssize_t err;
- XFS_STATS_ADD(xs_write_bytes, ret);
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
/* Handle various SYNC-type writes */
err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1063,17 +1112,6 @@ xfs_file_readdir(
return xfs_readdir(ip, ctx, bufsize);
}
-STATIC int
-xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
-{
- vma->vm_ops = &xfs_file_vm_ops;
-
- file_accessed(filp);
- return 0;
-}
-
/*
* This type is designed to indicate the type of offset we would like
* to search from page cache for xfs_seek_hole_data().
@@ -1454,48 +1492,166 @@ xfs_file_llseek(
* ordering of:
*
* mmap_sem (MM)
- * i_mmap_lock (XFS - truncate serialisation)
- * page_lock (MM)
- * i_lock (XFS - extent map serialisation)
+ * sb_start_pagefault(vfs, freeze)
+ * i_mmaplock (XFS - truncate serialisation)
+ * page_lock (MM)
+ * i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
*/
STATIC int
+xfs_filemap_page_mkwrite(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret;
+
+ trace_xfs_filemap_page_mkwrite(XFS_I(inode));
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ if (IS_DAX(inode)) {
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ } else {
+ ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite_return(ret);
+ }
+
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
+STATIC int
xfs_filemap_fault(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
- int error;
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret;
- trace_xfs_filemap_fault(ip);
+ trace_xfs_filemap_fault(XFS_I(inode));
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- error = filemap_fault(vma, vmf);
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ /* DAX can shortcut the normal fault path on write faults! */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
+ return xfs_filemap_page_mkwrite(vma, vmf);
- return error;
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ if (IS_DAX(inode)) {
+ /*
+ * we do not want to trigger unwritten extent conversion on read
+ * faults - that is unnecessary overhead and would also require
+ * changes to xfs_get_blocks_direct() to map unwritten extent
+ * ioend for conversion on read-only mappings.
+ */
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ } else
+ ret = filemap_fault(vma, vmf);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ return ret;
}
/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is no
+ * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * handle both cases. @flags carries the information on the type of fault
+ * occurring.
*/
STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_pmd_fault(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret;
+
+ if (!IS_DAX(inode))
+ return VM_FAULT_FALLBACK;
+
+ trace_xfs_filemap_pmd_fault(ip);
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+ NULL);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp
+ * updates on write faults. In reality, it's needed to serialise against
+ * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED to ensure the fault serialisation
+ * barrier is in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
- int error;
- trace_xfs_filemap_page_mkwrite(ip);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ trace_xfs_filemap_pfn_mkwrite(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+	/* check that the faulting page hasn't raced with truncate */
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
- return error;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = xfs_filemap_fault,
+ .pmd_fault = xfs_filemap_pmd_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = xfs_filemap_page_mkwrite,
+ .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+ struct file *filp,
+ struct vm_area_struct *vma)
+{
+ file_accessed(filp);
+ vma->vm_ops = &xfs_file_vm_ops;
+ if (IS_DAX(file_inode(filp)))
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ return 0;
}
const struct file_operations xfs_file_operations = {
@@ -1526,9 +1682,3 @@ const struct file_operations xfs_dir_file_operations = {
#endif
.fsync = xfs_dir_fsync,
};
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
- .fault = xfs_filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = xfs_filemap_page_mkwrite,
-};
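
As a rough illustration of the EOF check open-coded in the xfs_filemap_pfn_mkwrite() hunk above (round the inode size up to whole pages, then refuse faults whose page offset lies at or beyond that boundary), here is a minimal standalone C sketch. The page size, file size and function names are illustrative assumptions, not the kernel's definitions.

/* Minimal userspace sketch of the EOF check used in xfs_filemap_pfn_mkwrite():
 * round the file size up to whole pages and treat faults at or beyond that
 * page count as past EOF (SIGBUS in the real code).
 */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT 12			/* assumed 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Number of pages needed to cover 'isize' bytes (round up). */
static unsigned long pages_covering(unsigned long isize)
{
	return (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/* true if a fault at page offset 'pgoff' would be past EOF. */
static bool fault_past_eof(unsigned long pgoff, unsigned long isize)
{
	return pgoff >= pages_covering(isize);
}

int main(void)
{
	/* A 9000-byte file occupies 3 pages, so pgoff 0..2 are valid. */
	printf("pages = %lu\n", pages_covering(9000));			/* 3 */
	printf("pgoff 2 past EOF? %d\n", fault_past_eof(2, 9000));	/* 0 */
	printf("pgoff 3 past EOF? %d\n", fault_past_eof(3, 9000));	/* 1 */
	return 0;
}
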
diff --git a/kernel/fs/xfs/xfs_filestream.c b/kernel/fs/xfs/xfs_filestream.c
index da82f1cb4..c4c130f9b 100644
--- a/kernel/fs/xfs/xfs_filestream.c
+++ b/kernel/fs/xfs/xfs_filestream.c
@@ -196,7 +196,8 @@ xfs_filestream_pick_ag(
goto next_ag;
}
- longest = xfs_alloc_longest_free_extent(mp, pag);
+ longest = xfs_alloc_longest_free_extent(mp, pag,
+ xfs_alloc_min_freelist(mp, pag));
if (((minlen && longest >= minlen) ||
(!minlen && pag->pagf_freeblks >= minfree)) &&
(!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/kernel/fs/xfs/xfs_fsops.c b/kernel/fs/xfs/xfs_fsops.c
index cb7e8a29d..ee3aaa0a5 100644
--- a/kernel/fs/xfs/xfs_fsops.c
+++ b/kernel/fs/xfs/xfs_fsops.c
@@ -101,7 +101,9 @@ xfs_fs_geometry(
(xfs_sb_version_hasftype(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
(xfs_sb_version_hasfinobt(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+ XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
+ (xfs_sb_version_hassparseinodes(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -201,7 +203,7 @@ xfs_growfs_data_private(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
XFS_GROWFS_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -248,7 +250,7 @@ xfs_growfs_data_private(
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
@@ -271,7 +273,7 @@ xfs_growfs_data_private(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
agfl->agfl_seqno = cpu_to_be32(agno);
- uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
}
agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
@@ -307,7 +309,7 @@ xfs_growfs_data_private(
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
agi->agi_free_level = cpu_to_be32(1);
@@ -489,7 +491,7 @@ xfs_growfs_data_private(
if (dpct)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
return error;
@@ -557,7 +559,7 @@ xfs_growfs_data_private(
return saved_error ? saved_error : error;
error0:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
diff --git a/kernel/fs/xfs/xfs_icache.c b/kernel/fs/xfs/xfs_icache.c
index 76a9f2783..d7a490f24 100644
--- a/kernel/fs/xfs/xfs_icache.c
+++ b/kernel/fs/xfs/xfs_icache.c
@@ -63,7 +63,7 @@ xfs_inode_alloc(
return NULL;
}
- XFS_STATS_INC(vn_active);
+ XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
@@ -129,7 +129,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!xfs_isiflocked(ip));
- XFS_STATS_DEC(vn_active);
+ XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
@@ -159,7 +159,7 @@ xfs_iget_cache_hit(
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -177,7 +177,7 @@ xfs_iget_cache_hit(
*/
if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -259,7 +259,7 @@ xfs_iget_cache_hit(
xfs_ilock(ip, lock_flags);
xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
- XFS_STATS_INC(xs_ig_found);
+ XFS_STATS_INC(mp, xs_ig_found);
return 0;
@@ -342,7 +342,7 @@ xfs_iget_cache_miss(
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
if (unlikely(error)) {
WARN_ON(error != -EEXIST);
- XFS_STATS_INC(xs_ig_dup);
+ XFS_STATS_INC(mp, xs_ig_dup);
error = -EAGAIN;
goto out_preload_end;
}
@@ -412,6 +412,8 @@ xfs_iget(
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return -EINVAL;
+ XFS_STATS_INC(mp, xs_ig_attempts);
+
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
agino = XFS_INO_TO_AGINO(mp, ino);
@@ -427,7 +429,7 @@ again:
goto out_error_or_again;
} else {
rcu_read_unlock();
- XFS_STATS_INC(xs_ig_missed);
+ XFS_STATS_INC(mp, xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
flags, lock_flags);
@@ -963,7 +965,7 @@ reclaim:
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- XFS_STATS_INC(xs_ig_reclaims);
+ XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
/*
* Remove the inode from the per-AG radix tree.
*
diff --git a/kernel/fs/xfs/xfs_inode.c b/kernel/fs/xfs/xfs_inode.c
index fec4bfba0..8ee393996 100644
--- a/kernel/fs/xfs/xfs_inode.c
+++ b/kernel/fs/xfs/xfs_inode.c
@@ -363,6 +363,23 @@ int xfs_lock_delays;
#endif
/*
+ * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
+ * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
+ * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
+ * errors and warnings.
+ */
+#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
+static bool
+xfs_lockdep_subclass_ok(
+ int subclass)
+{
+ return subclass < MAX_LOCKDEP_SUBCLASSES;
+}
+#else
+#define xfs_lockdep_subclass_ok(subclass) (true)
+#endif
+
+/*
* Bump the subclass so xfs_lock_inodes() acquires each lock with a different
* value. This can be called for any type of inode lock combination, including
* parent locking. Care must be taken to ensure we don't overrun the subclass
@@ -375,11 +392,12 @@ xfs_lock_inumorder(int lock_mode, int subclass)
ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
XFS_ILOCK_RTSUM)));
+ ASSERT(xfs_lockdep_subclass_ok(subclass));
if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
- ASSERT(subclass + XFS_IOLOCK_PARENT_VAL <
- MAX_LOCKDEP_SUBCLASSES);
+ ASSERT(xfs_lockdep_subclass_ok(subclass +
+ XFS_IOLOCK_PARENT_VAL));
class += subclass << XFS_IOLOCK_SHIFT;
if (lock_mode & XFS_IOLOCK_PARENT)
class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
@@ -663,30 +681,29 @@ xfs_lookup(
{
xfs_ino_t inum;
int error;
- uint lock_mode;
trace_xfs_lookup(dp, name);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- lock_mode = xfs_ilock_data_map_shared(dp);
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
- xfs_iunlock(dp, lock_mode);
-
if (error)
- goto out;
+ goto out_unlock;
error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
if (error)
goto out_free_name;
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return 0;
out_free_name:
if (ci_name)
kmem_free(ci_name->name);
-out:
+out_unlock:
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
*ipp = NULL;
return error;
}
@@ -821,7 +838,7 @@ xfs_ialloc(
if (ip->i_d.di_version == 3) {
ASSERT(ip->i_d.di_ino == ino);
- ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid));
ip->i_d.di_crc = 0;
ip->i_d.di_changecount = 1;
ip->i_d.di_lsn = 0;
@@ -939,7 +956,6 @@ xfs_dir_ialloc(
{
xfs_trans_t *tp;
- xfs_trans_t *ntp;
xfs_inode_t *ip;
xfs_buf_t *ialloc_context = NULL;
int code;
@@ -988,8 +1004,6 @@ xfs_dir_ialloc(
* to succeed the second time.
*/
if (ialloc_context) {
- struct xfs_trans_res tres;
-
/*
* Normally, xfs_trans_commit releases all the locks.
* We call bhold to hang on to the ialloc_context across
@@ -998,12 +1012,6 @@ xfs_dir_ialloc(
* allocation group.
*/
xfs_trans_bhold(tp, ialloc_context);
- /*
- * Save the log reservation so we can use
- * them in the next transaction.
- */
- tres.tr_logres = xfs_trans_get_log_res(tp);
- tres.tr_logcount = xfs_trans_get_log_count(tp);
/*
* We want the quota changes to be associated with the next
@@ -1019,35 +1027,9 @@ xfs_dir_ialloc(
tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
}
- ntp = xfs_trans_dup(tp);
- code = xfs_trans_commit(tp, 0);
- tp = ntp;
- if (committed != NULL) {
+ code = xfs_trans_roll(&tp, 0);
+ if (committed != NULL)
*committed = 1;
- }
- /*
- * If we get an error during the commit processing,
- * release the buffer that is still held and return
- * to the caller.
- */
- if (code) {
- xfs_buf_relse(ialloc_context);
- if (dqinfo) {
- tp->t_dqinfo = dqinfo;
- xfs_trans_free_dqinfo(tp);
- }
- *tpp = ntp;
- *ipp = NULL;
- return code;
- }
-
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- code = xfs_trans_reserve(tp, &tres, 0, 0);
/*
* Re-attach the quota info that we detached from prev trx.
@@ -1059,7 +1041,7 @@ xfs_dir_ialloc(
if (code) {
xfs_buf_relse(ialloc_context);
- *tpp = ntp;
+ *tpp = tp;
*ipp = NULL;
return code;
}
@@ -1161,7 +1143,6 @@ xfs_create(
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- uint cancel_flags;
int committed;
prid_t prid;
struct xfs_dquot *udqp = NULL;
@@ -1198,8 +1179,6 @@ xfs_create(
tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
}
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
/*
* Initially assume that the file does not exist and
* reserve the resources for that case. If that is not
@@ -1217,12 +1196,12 @@ xfs_create(
resblks = 0;
error = xfs_trans_reserve(tp, tres, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto out_trans_cancel;
- }
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+ XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
xfs_bmap_init(&free_list, &first_block);
@@ -1248,11 +1227,8 @@ xfs_create(
*/
error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
prid, resblks > 0, &ip, &committed);
- if (error) {
- if (error == -ENOSPC)
- goto out_trans_cancel;
- goto out_trans_abort;
- }
+ if (error)
+ goto out_trans_cancel;
/*
* Now we join the directory inode to the transaction. We do not do it
@@ -1261,7 +1237,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1269,7 +1245,7 @@ xfs_create(
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
if (error) {
ASSERT(error != -ENOSPC);
- goto out_trans_abort;
+ goto out_trans_cancel;
}
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -1303,7 +1279,7 @@ xfs_create(
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_release_inode;
@@ -1316,10 +1292,8 @@ xfs_create(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
out_release_inode:
/*
* Wait until after the current transaction is aborted to finish the
@@ -1336,7 +1310,7 @@ xfs_create(
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return error;
}
@@ -1351,7 +1325,6 @@ xfs_create_tmpfile(
struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
int error;
- uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -1384,10 +1357,8 @@ xfs_create_tmpfile(
resblks = 0;
error = xfs_trans_reserve(tp, tres, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto out_trans_cancel;
- }
error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
pdqp, resblks, 1, 0);
@@ -1396,11 +1367,8 @@ xfs_create_tmpfile(
error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
prid, resblks > 0, &ip, NULL);
- if (error) {
- if (error == -ENOSPC)
- goto out_trans_cancel;
- goto out_trans_abort;
- }
+ if (error)
+ goto out_trans_cancel;
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -1415,9 +1383,9 @@ xfs_create_tmpfile(
ip->i_d.di_nlink--;
error = xfs_iunlink(tp, ip);
if (error)
- goto out_trans_abort;
+ goto out_trans_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_release_inode;
@@ -1428,10 +1396,8 @@ xfs_create_tmpfile(
*ipp = ip;
return 0;
- out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
out_release_inode:
/*
* Wait until after the current transaction is aborted to finish the
@@ -1461,7 +1427,6 @@ xfs_link(
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int cancel_flags;
int committed;
int resblks;
@@ -1481,22 +1446,20 @@ xfs_link(
goto std_return;
tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
if (error == -ENOSPC) {
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto error_return;
- }
+ xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
/*
* If we are using project inheritance, we only allow hard link
@@ -1520,19 +1483,19 @@ xfs_link(
if (sip->i_d.di_nlink == 0) {
error = xfs_iunlink_remove(tp, sip);
if (error)
- goto abort_return;
+ goto error_return;
}
error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
&first_block, &free_list, resblks);
if (error)
- goto abort_return;
+ goto error_return;
xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
error = xfs_bumplink(tp, sip);
if (error)
- goto abort_return;
+ goto error_return;
/*
* If this is a synchronous mount, make sure that the
@@ -1546,15 +1509,13 @@ xfs_link(
error = xfs_bmap_finish (&tp, &free_list, &committed);
if (error) {
xfs_bmap_cancel(&free_list);
- goto abort_return;
+ goto error_return;
}
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return xfs_trans_commit(tp);
- abort_return:
- cancel_flags |= XFS_TRANS_ABORT;
error_return:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
std_return:
return error;
}
@@ -1589,7 +1550,6 @@ xfs_itruncate_extents(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
- struct xfs_trans *ntp;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
xfs_fileoff_t first_unmap_block;
@@ -1647,29 +1607,7 @@ xfs_itruncate_extents(
if (error)
goto out_bmap_cancel;
- if (committed) {
- /*
- * Mark the inode dirty so it will be logged and
- * moved forward in the log as part of every commit.
- */
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
-
- ntp = xfs_trans_dup(tp);
- error = xfs_trans_commit(tp, 0);
- tp = ntp;
-
- xfs_trans_ijoin(tp, ip, 0);
-
- if (error)
- goto out;
-
- /*
- * Transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_roll(&tp, ip);
if (error)
goto out;
}
@@ -1790,7 +1728,7 @@ xfs_inactive_truncate(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1811,7 +1749,7 @@ xfs_inactive_truncate(
ASSERT(ip->i_d.di_nextents == 0);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error_unlock;
@@ -1819,7 +1757,7 @@ xfs_inactive_truncate(
return 0;
error_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -1869,7 +1807,7 @@ xfs_inactive_ifree(
} else {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
}
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1889,7 +1827,7 @@ xfs_inactive_ifree(
__func__, error);
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
}
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1900,15 +1838,16 @@ xfs_inactive_ifree(
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
/*
- * Just ignore errors at this point. There is nothing we can
- * do except to try to keep going. Make sure it's not a silent
- * error.
+ * Just ignore errors at this point. There is nothing we can do except
+ * to try to keep going. Make sure it's not a silent error.
*/
error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error)
+ if (error) {
xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
__func__, error);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_bmap_cancel(&free_list);
+ }
+ error = xfs_trans_commit(tp);
if (error)
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
__func__, error);
@@ -2269,28 +2208,42 @@ xfs_iunlink_remove(
*/
STATIC int
xfs_ifree_cluster(
- xfs_inode_t *free_ip,
- xfs_trans_t *tp,
- xfs_ino_t inum)
+ xfs_inode_t *free_ip,
+ xfs_trans_t *tp,
+ struct xfs_icluster *xic)
{
xfs_mount_t *mp = free_ip->i_mount;
int blks_per_cluster;
int inodes_per_cluster;
int nbufs;
int i, j;
+ int ioffset;
xfs_daddr_t blkno;
xfs_buf_t *bp;
xfs_inode_t *ip;
xfs_inode_log_item_t *iip;
xfs_log_item_t *lip;
struct xfs_perag *pag;
+ xfs_ino_t inum;
+ inum = xic->first_ino;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
blks_per_cluster = xfs_icluster_size_fsb(mp);
inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
nbufs = mp->m_ialloc_blks / blks_per_cluster;
for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+ /*
+ * The allocation bitmap tells us which inodes of the chunk were
+ * physically allocated. Skip the cluster if an inode falls into
+ * a sparse region.
+ */
+ ioffset = inum - xic->first_ino;
+ if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+ ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
+ continue;
+ }
+
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
@@ -2412,6 +2365,7 @@ retry:
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -2448,8 +2402,7 @@ xfs_ifree(
xfs_bmap_free_t *flist)
{
int error;
- int delete;
- xfs_ino_t first_ino;
+ struct xfs_icluster xic = { 0 };
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(ip->i_d.di_nlink == 0);
@@ -2465,7 +2418,7 @@ xfs_ifree(
if (error)
return error;
- error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
+ error = xfs_difree(tp, ip->i_ino, flist, &xic);
if (error)
return error;
@@ -2482,8 +2435,8 @@ xfs_ifree(
ip->i_d.di_gen++;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (delete)
- error = xfs_ifree_cluster(ip, tp, first_ino);
+ if (xic.deleted)
+ error = xfs_ifree_cluster(ip, tp, &xic);
return error;
}
@@ -2570,7 +2523,6 @@ xfs_remove(
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int cancel_flags;
int committed;
uint resblks;
@@ -2591,7 +2543,6 @@ xfs_remove(
tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
else
tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
* We try to get the real space reservation first,
@@ -2610,19 +2561,18 @@ xfs_remove(
}
if (error) {
ASSERT(error != -ENOSPC);
- cancel_flags = 0;
goto out_trans_cancel;
}
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
* If we're removing a directory perform some additional validation.
*/
- cancel_flags |= XFS_TRANS_ABORT;
if (is_dir) {
ASSERT(ip->i_d.di_nlink >= 2);
if (ip->i_d.di_nlink != 2) {
@@ -2678,7 +2628,7 @@ xfs_remove(
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto std_return;
@@ -2690,7 +2640,7 @@ xfs_remove(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
std_return:
return error;
}
@@ -2764,11 +2714,11 @@ xfs_finish_rename(
error = xfs_bmap_finish(&tp, free_list, &committed);
if (error) {
xfs_bmap_cancel(free_list);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return xfs_trans_commit(tp);
}
/*
@@ -2889,7 +2839,7 @@ xfs_cross_rename(
out_trans_abort:
xfs_bmap_cancel(free_list);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
@@ -2949,7 +2899,6 @@ xfs_rename(
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
- int cancel_flags = 0;
int spaceres;
int error;
@@ -2985,7 +2934,6 @@ xfs_rename(
}
if (error)
goto out_trans_cancel;
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
* Attach the dquots to the inodes
@@ -3000,6 +2948,12 @@ xfs_rename(
* whether the target directory is the same as the source
* directory, we can lock from 2 to 4 inodes.
*/
+ if (!new_parent)
+ xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+ else
+ xfs_lock_two_inodes(src_dp, target_dp,
+ XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
@@ -3007,9 +2961,9 @@ xfs_rename(
* we can rely on either trans_commit or trans_cancel to unlock
* them.
*/
- xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
if (new_parent)
- xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
if (target_ip)
xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
@@ -3056,10 +3010,8 @@ xfs_rename(
error = xfs_dir_createname(tp, target_dp, target_name,
src_ip->i_ino, &first_block,
&free_list, spaceres);
- if (error == -ENOSPC)
- goto out_bmap_cancel;
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
xfs_trans_ichgtime(tp, target_dp,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3067,7 +3019,7 @@ xfs_rename(
if (new_parent && src_is_directory) {
error = xfs_bumplink(tp, target_dp);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
}
} else { /* target_ip != NULL */
/*
@@ -3099,7 +3051,7 @@ xfs_rename(
src_ip->i_ino,
&first_block, &free_list, spaceres);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
xfs_trans_ichgtime(tp, target_dp,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3110,7 +3062,7 @@ xfs_rename(
*/
error = xfs_droplink(tp, target_ip);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
if (src_is_directory) {
/*
@@ -3118,7 +3070,7 @@ xfs_rename(
*/
error = xfs_droplink(tp, target_ip);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
}
} /* target_ip != NULL */
@@ -3135,7 +3087,7 @@ xfs_rename(
&first_block, &free_list, spaceres);
ASSERT(error != -EEXIST);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
}
/*
@@ -3161,7 +3113,7 @@ xfs_rename(
*/
error = xfs_droplink(tp, src_dp);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
}
/*
@@ -3176,7 +3128,7 @@ xfs_rename(
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
&first_block, &free_list, spaceres);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
/*
* For whiteouts, we need to bump the link count on the whiteout inode.
@@ -3190,10 +3142,10 @@ xfs_rename(
ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
error = xfs_bumplink(tp, wip);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
error = xfs_iunlink_remove(tp, wip);
if (error)
- goto out_trans_abort;
+ goto out_bmap_cancel;
xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
/*
@@ -3214,12 +3166,10 @@ xfs_rename(
IRELE(wip);
return error;
-out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
if (wip)
IRELE(wip);
return error;
@@ -3322,8 +3272,8 @@ xfs_iflush_cluster(
}
if (clcount) {
- XFS_STATS_INC(xs_icluster_flushcnt);
- XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
}
out_free:
@@ -3396,7 +3346,7 @@ xfs_iflush(
struct xfs_dinode *dip;
int error;
- XFS_STATS_INC(xs_iflush_count);
+ XFS_STATS_INC(mp, xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
@@ -3498,7 +3448,7 @@ xfs_iflush_int(
ASSERT(ip->i_d.di_version > 1);
/* set *dip = inode's place in the buffer */
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
@@ -3611,6 +3561,7 @@ xfs_iflush_int(
*/
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
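
The sparse-inode handling added to xfs_ifree_cluster() above skips any cluster whose inodes fall into a hole of the chunk's allocation bitmap. The following userspace sketch mirrors that bitmap walk; the chunk geometry (64 inodes per chunk, 16 per cluster), the bitmap value and the inobt_mask() helper are illustrative assumptions rather than XFS's actual constants.

/* Sketch of the sparse-cluster skip: one bit per inode in the chunk's
 * allocation bitmap; a cluster is skipped when the bit for its first
 * inode is clear.
 */
#include <stdio.h>
#include <stdint.h>

#define INODES_PER_CHUNK   64
#define INODES_PER_CLUSTER 16

/* Rough equivalent of XFS_INOBT_MASK(i): bit i of the allocation bitmap. */
static uint64_t inobt_mask(unsigned int i)
{
	return 1ULL << i;
}

int main(void)
{
	uint64_t first_ino = 1024;		/* first inode of the chunk */
	uint64_t alloc = 0x0000ffff0000ffffULL;	/* clusters 0 and 2 allocated */
	uint64_t inum;

	for (inum = first_ino; inum < first_ino + INODES_PER_CHUNK;
	     inum += INODES_PER_CLUSTER) {
		unsigned int ioffset = inum - first_ino;

		if (!(alloc & inobt_mask(ioffset))) {
			printf("cluster at inode %llu: sparse, skipped\n",
			       (unsigned long long)inum);
			continue;
		}
		printf("cluster at inode %llu: free its buffers\n",
		       (unsigned long long)inum);
	}
	return 0;
}
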
diff --git a/kernel/fs/xfs/xfs_inode.h b/kernel/fs/xfs/xfs_inode.h
index ee26a603c..ca9e11989 100644
--- a/kernel/fs/xfs/xfs_inode.h
+++ b/kernel/fs/xfs/xfs_inode.h
@@ -328,7 +328,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
* 5 PARENT subclass (not nestable)
* 6 RTBITMAP subclass (not nestable)
* 7 RTSUM subclass (not nestable)
- *
+ *
*/
#define XFS_IOLOCK_SHIFT 16
#define XFS_IOLOCK_PARENT_VAL 4
diff --git a/kernel/fs/xfs/xfs_inode_item.c b/kernel/fs/xfs/xfs_inode_item.c
index bf13a5a7e..d14b12b8c 100644
--- a/kernel/fs/xfs/xfs_inode_item.c
+++ b/kernel/fs/xfs/xfs_inode_item.c
@@ -703,17 +703,10 @@ xfs_iflush_abort(
xfs_inode_log_item_t *iip = ip->i_itemp;
if (iip) {
- struct xfs_ail *ailp = iip->ili_item.li_ailp;
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
- spin_lock(&ailp->xa_lock);
- if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, &iip->ili_item,
- stale ?
- SHUTDOWN_LOG_IO_ERROR :
+ xfs_trans_ail_remove(&iip->ili_item,
+ stale ? SHUTDOWN_LOG_IO_ERROR :
SHUTDOWN_CORRUPT_INCORE);
- } else
- spin_unlock(&ailp->xa_lock);
}
iip->ili_logged = 0;
/*
@@ -726,6 +719,7 @@ xfs_iflush_abort(
* attempted.
*/
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
}
/*
* Release the inode's flush lock since we're done with it.
diff --git a/kernel/fs/xfs/xfs_inode_item.h b/kernel/fs/xfs/xfs_inode_item.h
index 488d81254..4c7722e32 100644
--- a/kernel/fs/xfs/xfs_inode_item.h
+++ b/kernel/fs/xfs/xfs_inode_item.h
@@ -34,6 +34,7 @@ typedef struct xfs_inode_log_item {
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
+ unsigned int ili_fsync_fields; /* logged since last fsync */
} xfs_inode_log_item_t;
static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/kernel/fs/xfs/xfs_ioctl.c b/kernel/fs/xfs/xfs_ioctl.c
index 87f67c6b6..d42738dee 100644
--- a/kernel/fs/xfs/xfs_ioctl.c
+++ b/kernel/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@
#include "xfs_symlink.h"
#include "xfs_trans.h"
#include "xfs_pnfs.h"
+#include "xfs_acl.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -336,7 +337,7 @@ xfs_set_dmattrs(
tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -346,7 +347,7 @@ xfs_set_dmattrs(
ip->i_d.di_dmstate = state;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
return error;
}
@@ -411,7 +412,7 @@ xfs_attrlist_by_handle(
if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
@@ -455,7 +456,7 @@ xfs_attrmulti_attr_get(
unsigned char *kbuf;
int error = -EFAULT;
- if (*len > XATTR_SIZE_MAX)
+ if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = kmem_zalloc_large(*len, KM_SLEEP);
if (!kbuf)
@@ -482,17 +483,22 @@ xfs_attrmulti_attr_set(
__uint32_t flags)
{
unsigned char *kbuf;
+ int error;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- if (len > XATTR_SIZE_MAX)
+ if (len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = memdup_user(ubuf, len);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ kfree(kbuf);
+ return error;
}
int
@@ -501,9 +507,14 @@ xfs_attrmulti_attr_remove(
unsigned char *name,
__uint32_t flags)
{
+ int error;
+
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- return xfs_attr_remove(XFS_I(inode), name, flags);
+ error = xfs_attr_remove(XFS_I(inode), name, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ return error;
}
STATIC int
@@ -1028,7 +1039,7 @@ xfs_ioctl_setattr_xflags(
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
return 0;
}
@@ -1076,7 +1087,7 @@ xfs_ioctl_setattr_get_trans(
return tp;
out_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return ERR_PTR(error);
}
@@ -1253,7 +1264,7 @@ xfs_ioctl_setattr(
else
ip->i_d.di_extsize = 0;
- code = xfs_trans_commit(tp, 0);
+ code = xfs_trans_commit(tp);
/*
* Release any dquot(s) the inode had kept before chown.
@@ -1265,7 +1276,7 @@ xfs_ioctl_setattr(
return code;
error_trans_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
error_free_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(pdqp);
@@ -1338,11 +1349,11 @@ xfs_ioc_setxflags(
error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_drop_write;
}
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_drop_write:
mnt_drop_write_file(filp);
return error;
diff --git a/kernel/fs/xfs/xfs_ioctl32.c b/kernel/fs/xfs/xfs_ioctl32.c
index b88bdc85d..1a05d8ae3 100644
--- a/kernel/fs/xfs/xfs_ioctl32.c
+++ b/kernel/fs/xfs/xfs_ioctl32.c
@@ -356,7 +356,7 @@ xfs_compat_attrlist_by_handle(
sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
diff --git a/kernel/fs/xfs/xfs_iomap.c b/kernel/fs/xfs/xfs_iomap.c
index 38e633bad..f4f5b43cf 100644
--- a/kernel/fs/xfs/xfs_iomap.c
+++ b/kernel/fs/xfs/xfs_iomap.c
@@ -131,20 +131,30 @@ xfs_iomap_write_direct(
uint qblocks, resblks, resrtextents;
int committed;
int error;
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
+ int lockmode;
+ int bmapi_flags = XFS_BMAPI_PREALLOC;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
+ lockmode = XFS_ILOCK_SHARED; /* locked by caller */
+
+ ASSERT(xfs_isilocked(ip, lockmode));
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
if ((offset + count) > XFS_ISIZE(ip)) {
+ /*
+ * Assert that the in-core extent list is present since this can
+ * call xfs_iread_extents() and we only have the ilock shared.
+ * This should be safe because the lock was held around a bmapi
+ * call in the caller and we only need it to access the in-core
+ * list.
+ */
+ ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
+ XFS_IFEXTENTS);
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
- return error;
+ goto out_unlock;
} else {
if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -174,20 +184,47 @@ xfs_iomap_write_direct(
}
/*
+ * Drop the shared lock acquired by the caller, attach the dquot if
+ * necessary and move on to transaction setup.
+ */
+ xfs_iunlock(ip, lockmode);
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
* Allocate and setup the transaction
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+ /*
+ * For DAX, we do not allocate unwritten extents, but instead we zero
+ * the block before we commit the transaction. Ideally we'd like to do
+ * this outside the transaction context, but if we commit and then crash
+ * we may not have zeroed the blocks and this will be exposed on
+ * recovery of the allocation. Hence we must zero before commit.
+ * Further, if we are mapping unwritten extents here, we need to zero
+ * and convert them to written so that we don't need an unwritten extent
+ * callback for DAX. This also means that we need to be able to dip into
+ * the reserve block pool if there is no space left but we need to do
+ * unwritten extent conversion.
+ */
+ if (IS_DAX(VFS_I(ip))) {
+ bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ }
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
resblks, resrtextents);
/*
* Check for running out of space, note: need lock to return
*/
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockmode);
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
@@ -202,8 +239,8 @@ xfs_iomap_write_direct(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_PREALLOC, &firstfsb, 0,
- imap, &nimaps, &free_list);
+ bmapi_flags, &firstfsb, resblks, imap,
+ &nimaps, &free_list);
if (error)
goto out_bmap_cancel;
@@ -213,7 +250,8 @@ xfs_iomap_write_direct(
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
@@ -229,14 +267,14 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
@@ -670,7 +708,7 @@ xfs_iomap_write_allocate(
count_fsb = imap->br_blockcount;
map_start_fsb = imap->br_startoff;
- XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
while (count_fsb != 0) {
/*
@@ -690,7 +728,7 @@ xfs_iomap_write_allocate(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
nres, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -750,9 +788,9 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, 0,
- &first_block, 1,
- imap, &nimaps, &free_list);
+ count_fsb, 0, &first_block,
+ nres, imap, &nimaps,
+ &free_list);
if (error)
goto trans_cancel;
@@ -760,7 +798,7 @@ xfs_iomap_write_allocate(
if (error)
goto trans_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error0;
@@ -777,7 +815,7 @@ xfs_iomap_write_allocate(
if ((offset_fsb >= imap->br_startoff) &&
(offset_fsb < (imap->br_startoff +
imap->br_blockcount))) {
- XFS_STATS_INC(xs_xstrat_quick);
+ XFS_STATS_INC(mp, xs_xstrat_quick);
return 0;
}
@@ -791,7 +829,7 @@ xfs_iomap_write_allocate(
trans_cancel:
xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error0:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -853,7 +891,7 @@ xfs_iomap_write_unwritten(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
resblks, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -866,8 +904,8 @@ xfs_iomap_write_unwritten(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_CONVERT, &firstfsb,
- 1, &imap, &nimaps, &free_list);
+ XFS_BMAPI_CONVERT, &firstfsb, resblks,
+ &imap, &nimaps, &free_list);
if (error)
goto error_on_bmapi_transaction;
@@ -890,7 +928,7 @@ xfs_iomap_write_unwritten(
if (error)
goto error_on_bmapi_transaction;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
@@ -914,7 +952,7 @@ xfs_iomap_write_unwritten(
error_on_bmapi_transaction:
xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
diff --git a/kernel/fs/xfs/xfs_iops.c b/kernel/fs/xfs/xfs_iops.c
index f4cd7204e..245268a0c 100644
--- a/kernel/fs/xfs/xfs_iops.c
+++ b/kernel/fs/xfs/xfs_iops.c
@@ -41,7 +41,6 @@
#include <linux/capability.h>
#include <linux/xattr.h>
-#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
#include <linux/fiemap.h>
@@ -414,10 +413,10 @@ xfs_vn_rename(
* we need to be very careful about how much stack we use.
* uio is kmalloced for this reason...
*/
-STATIC void *
+STATIC const char *
xfs_vn_follow_link(
struct dentry *dentry,
- struct nameidata *nd)
+ void **cookie)
{
char *link;
int error = -ENOMEM;
@@ -430,14 +429,12 @@ xfs_vn_follow_link(
if (unlikely(error))
goto out_kfree;
- nd_set_link(nd, link);
- return NULL;
+ return *cookie = link;
out_kfree:
kfree(link);
out_err:
- nd_set_link(nd, ERR_PTR(error));
- return NULL;
+ return ERR_PTR(error);
}
STATIC int
@@ -612,7 +609,7 @@ xfs_setattr_nonsize(
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error)
- goto out_dqrele;
+ goto out_trans_cancel;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -643,7 +640,7 @@ xfs_setattr_nonsize(
NULL, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (error) /* out of quota */
- goto out_trans_cancel;
+ goto out_unlock;
}
}
@@ -698,11 +695,11 @@ xfs_setattr_nonsize(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -732,10 +729,10 @@ xfs_setattr_nonsize(
return 0;
-out_trans_cancel:
- xfs_trans_cancel(tp, 0);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_dqrele:
+out_trans_cancel:
+ xfs_trans_cancel(tp);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
return error;
@@ -755,7 +752,6 @@ xfs_setattr_size(
struct xfs_trans *tp;
int error;
uint lock_flags = 0;
- uint commit_flags = 0;
bool did_zeroing = false;
trace_xfs_setattr(ip);
@@ -851,7 +847,11 @@ xfs_setattr_size(
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
*/
- error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+ if (IS_DAX(inode))
+ error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+ else
+ error = block_truncate_page(inode->i_mapping, newsize,
+ xfs_get_blocks);
if (error)
return error;
truncate_setsize(inode, newsize);
@@ -861,7 +861,6 @@ xfs_setattr_size(
if (error)
goto out_trans_cancel;
- commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -901,7 +900,7 @@ xfs_setattr_size(
if (newsize <= oldsize) {
error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
if (error)
- goto out_trans_abort;
+ goto out_trans_cancel;
/*
* Truncated "down", so we're removing references to old data
@@ -923,21 +922,19 @@ xfs_setattr_size(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
out_unlock:
if (lock_flags)
xfs_iunlock(ip, lock_flags);
return error;
-out_trans_abort:
- commit_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, commit_flags);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
@@ -984,7 +981,7 @@ xfs_vn_update_time(
tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1006,7 +1003,7 @@ xfs_vn_update_time(
}
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1191,22 +1188,22 @@ xfs_diflags_to_iflags(
struct inode *inode,
struct xfs_inode *ip)
{
- if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+ uint16_t flags = ip->i_d.di_flags;
+
+ inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+ S_NOATIME | S_DAX);
+
+ if (flags & XFS_DIFLAG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+ if (flags & XFS_DIFLAG_APPEND)
inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+ if (flags & XFS_DIFLAG_SYNC)
inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+ if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
+ /* XXX: Also needs an on-disk per inode flag! */
+ if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+ inode->i_flags |= S_DAX;
}
/*
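
The xfs_diflags_to_iflags() rework above replaces per-flag if/else toggling with a clear-then-set translation: drop the whole group of VFS flags once, then re-add only the bits implied by the on-disk flags. A minimal sketch of that pattern follows; the flag values and function name are illustrative assumptions, not the kernel's definitions.

/* Clear-then-set flag translation, as adopted in xfs_diflags_to_iflags(). */
#include <stdio.h>
#include <stdint.h>

#define S_IMMUTABLE  0x01
#define S_APPEND     0x02
#define S_SYNC       0x04
#define S_NOATIME    0x08

#define XFS_DIFLAG_IMMUTABLE 0x1
#define XFS_DIFLAG_APPEND    0x2
#define XFS_DIFLAG_SYNC      0x4
#define XFS_DIFLAG_NOATIME   0x8

static unsigned int diflags_to_iflags(unsigned int i_flags, uint16_t di_flags)
{
	/* Drop every flag in the translated group... */
	i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);

	/* ...and re-add only the ones the on-disk inode asks for. */
	if (di_flags & XFS_DIFLAG_IMMUTABLE)
		i_flags |= S_IMMUTABLE;
	if (di_flags & XFS_DIFLAG_APPEND)
		i_flags |= S_APPEND;
	if (di_flags & XFS_DIFLAG_SYNC)
		i_flags |= S_SYNC;
	if (di_flags & XFS_DIFLAG_NOATIME)
		i_flags |= S_NOATIME;
	return i_flags;
}

int main(void)
{
	/* A stale S_SYNC disappears once the disk flags no longer carry it. */
	unsigned int flags = diflags_to_iflags(S_SYNC, XFS_DIFLAG_NOATIME);

	printf("iflags = 0x%x\n", flags);	/* 0x8: only S_NOATIME */
	return 0;
}
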
diff --git a/kernel/fs/xfs/xfs_itable.c b/kernel/fs/xfs/xfs_itable.c
index 80429891d..930ebd86b 100644
--- a/kernel/fs/xfs/xfs_itable.c
+++ b/kernel/fs/xfs/xfs_itable.c
@@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk(
}
irec->ir_free |= xfs_inobt_maskn(0, idx);
- *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+ *icount = irec->ir_count - irec->ir_freecount;
}
return 0;
@@ -415,6 +415,8 @@ xfs_bulkstat(
goto del_cursor;
if (icount) {
irbp->ir_startino = r.ir_startino;
+ irbp->ir_holemask = r.ir_holemask;
+ irbp->ir_count = r.ir_count;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
@@ -447,13 +449,15 @@ xfs_bulkstat(
* If this chunk has any allocated inodes, save it.
* Also start read-ahead now for this chunk.
*/
- if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+ if (r.ir_freecount < r.ir_count) {
xfs_bulkstat_ichunk_ra(mp, agno, &r);
irbp->ir_startino = r.ir_startino;
+ irbp->ir_holemask = r.ir_holemask;
+ irbp->ir_count = r.ir_count;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
- icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
+ icount += r.ir_count - r.ir_freecount;
}
error = xfs_btree_increment(cur, 0, &stat);
if (error || stat == 0) {
@@ -469,7 +473,8 @@ xfs_bulkstat(
* pending error, then we are done.
*/
del_cursor:
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, error ?
+ XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
xfs_buf_relse(agbp);
if (error)
break;
@@ -599,8 +604,7 @@ xfs_inumbers(
agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
buffer[bufidx].xi_startino =
XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
- buffer[bufidx].xi_alloccount =
- XFS_INODES_PER_CHUNK - r.ir_freecount;
+ buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
buffer[bufidx].xi_allocmask = ~r.ir_free;
if (++bufidx == bcount) {
long written;
diff --git a/kernel/fs/xfs/xfs_linux.h b/kernel/fs/xfs/xfs_linux.h
index 7c7842c85..ec0e239a0 100644
--- a/kernel/fs/xfs/xfs_linux.h
+++ b/kernel/fs/xfs/xfs_linux.h
@@ -32,26 +32,12 @@ typedef unsigned int __uint32_t;
typedef signed long long int __int64_t;
typedef unsigned long long int __uint64_t;
-typedef __uint32_t inst_t; /* an instruction */
-
typedef __s64 xfs_off_t; /* <file offset> type */
typedef unsigned long long xfs_ino_t; /* <inode> type */
typedef __s64 xfs_daddr_t; /* <disk address> type */
-typedef char * xfs_caddr_t; /* <core address> type */
typedef __u32 xfs_dev_t;
typedef __u32 xfs_nlink_t;
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
#include "xfs_types.h"
#include "kmem.h"
@@ -185,6 +171,13 @@ struct xfs_kobj {
struct completion complete;
};
+struct xstats {
+ struct xfsstats __percpu *xs_stats;
+ struct xfs_kobj xs_kobj;
+};
+
+extern struct xstats xfsstats;
+
/* Kernel uid/gid conversion. These are used to convert to/from the on disk
* uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
* The conversion here is type only, the value will remain the same since we
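
The new struct xstats above, together with the XFS_STATS_INC(mp, ...) conversions scattered throughout this patch, moves statistics from one global table to counters hanging off the mount being operated on. The sketch below models only that indirection in plain userspace C; the real kernel code uses __percpu counters, and every name here (demo_mount, DEMO_STATS_INC, the device paths) is an illustrative assumption.

/* Per-mount counters instead of a single global table. */
#include <stdio.h>
#include <stdint.h>

struct demo_stats {
	uint64_t xs_write_calls;
	uint64_t xs_ig_attempts;
};

struct demo_mount {
	const char *name;
	struct demo_stats stats;	/* per-mount, analogous to mp-relative stats */
};

#define DEMO_STATS_INC(mp, field)	((mp)->stats.field++)

int main(void)
{
	struct demo_mount a = { .name = "/dev/sda1" };
	struct demo_mount b = { .name = "/dev/sdb1" };

	DEMO_STATS_INC(&a, xs_write_calls);
	DEMO_STATS_INC(&a, xs_write_calls);
	DEMO_STATS_INC(&b, xs_ig_attempts);

	/* Each mount now reports its own activity instead of one global sum. */
	printf("%s: write_calls=%llu\n", a.name,
	       (unsigned long long)a.stats.xs_write_calls);
	printf("%s: ig_attempts=%llu\n", b.name,
	       (unsigned long long)b.stats.xs_ig_attempts);
	return 0;
}
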
diff --git a/kernel/fs/xfs/xfs_log.c b/kernel/fs/xfs/xfs_log.c
index bcc7cfabb..f52c72a1a 100644
--- a/kernel/fs/xfs/xfs_log.c
+++ b/kernel/fs/xfs/xfs_log.c
@@ -109,7 +109,7 @@ xlog_ungrant_log_space(
STATIC void
xlog_verify_dest_ptr(
struct xlog *log,
- char *ptr);
+ void *ptr);
STATIC void
xlog_verify_grant_tail(
struct xlog *log);
@@ -268,7 +268,7 @@ xlog_grant_head_wait(
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&head->lock);
- XFS_STATS_INC(xs_sleep_logspace);
+ XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
trace_xfs_log_grant_sleep(log, tic);
schedule();
@@ -379,7 +379,7 @@ xfs_log_regrant(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
/*
* This is a new transaction on the ticket, so we need to change the
@@ -448,7 +448,7 @@ xfs_log_reserve(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
@@ -513,7 +513,7 @@ xfs_log_done(
struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- uint flags)
+ bool regrant)
{
struct xlog *log = mp->m_log;
xfs_lsn_t lsn = 0;
@@ -526,14 +526,11 @@ xfs_log_done(
(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
(xlog_commit_record(log, ticket, iclog, &lsn)))) {
lsn = (xfs_lsn_t) -1;
- if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
- flags |= XFS_LOG_REL_PERM_RESERV;
- }
+ regrant = false;
}
- if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
- (flags & XFS_LOG_REL_PERM_RESERV)) {
+ if (!regrant) {
trace_xfs_log_done_nonperm(log, ticket);
/*
@@ -541,7 +538,6 @@ xfs_log_done(
* request has been made to release a permanent reservation.
*/
xlog_ungrant_log_space(log, ticket);
- xfs_log_ticket_put(ticket);
} else {
trace_xfs_log_done_perm(log, ticket);
@@ -553,6 +549,7 @@ xfs_log_done(
ticket->t_flags |= XLOG_TIC_INITED;
}
+ xfs_log_ticket_put(ticket);
return lsn;
}
@@ -671,9 +668,9 @@ xfs_log_mount(
ASSERT(0);
goto out_free_log;
}
+ xfs_crit(mp, "Log size out of supported range.");
xfs_crit(mp,
-"Log size out of supported range. Continuing onwards, but if log hangs are\n"
-"experienced then please report this message in the bug report.");
+"Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
}
/*
@@ -703,6 +700,7 @@ xfs_log_mount(
if (error) {
xfs_warn(mp, "log mount/recovery failed: error %d",
error);
+ xlog_recover_cancel(mp->m_log);
goto out_destroy_ail;
}
}
@@ -743,18 +741,35 @@ out:
* it.
*/
int
-xfs_log_mount_finish(xfs_mount_t *mp)
+xfs_log_mount_finish(
+ struct xfs_mount *mp)
{
int error = 0;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
- error = xlog_recover_finish(mp->m_log);
- if (!error)
- xfs_log_work_queue(mp);
- } else {
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ return 0;
}
+ error = xlog_recover_finish(mp->m_log);
+ if (!error)
+ xfs_log_work_queue(mp);
+
+ return error;
+}
+
+/*
+ * The mount has failed. Cancel the recovery if it hasn't completed and destroy
+ * the log.
+ */
+int
+xfs_log_mount_cancel(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = xlog_recover_cancel(mp->m_log);
+ xfs_log_unmount(mp);
return error;
}
@@ -1145,11 +1160,13 @@ xlog_space_left(
* In this case we just want to return the size of the
* log as the amount of space left.
*/
+ xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
xfs_alert(log->l_mp,
- "xlog_space_left: head behind tail\n"
- " tail_cycle = %d, tail_bytes = %d\n"
- " GH cycle = %d, GH bytes = %d",
- tail_cycle, tail_bytes, head_cycle, head_bytes);
+ " tail_cycle = %d, tail_bytes = %d",
+ tail_cycle, tail_bytes);
+ xfs_alert(log->l_mp,
+ " GH cycle = %d, GH bytes = %d",
+ head_cycle, head_bytes);
ASSERT(0);
free_bytes = log->l_logsize;
}
@@ -1447,7 +1464,7 @@ xlog_alloc_log(
iclog->ic_bp = bp;
iclog->ic_data = bp->b_addr;
#ifdef DEBUG
- log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+ log->l_iclog_bak[i] = &iclog->ic_header;
#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
@@ -1602,7 +1619,7 @@ xlog_pack_data(
int i, j, k;
int size = iclog->ic_offset + roundoff;
__be32 cycle_lsn;
- xfs_caddr_t dp;
+ char *dp;
cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
@@ -1655,8 +1672,13 @@ xlog_cksum(
if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
int i;
+ int xheads;
+
+ xheads = size / XLOG_HEADER_CYCLE_SIZE;
+ if (size % XLOG_HEADER_CYCLE_SIZE)
+ xheads++;
- for (i = 1; i < log->l_iclog_heads; i++) {
+ for (i = 1; i < xheads; i++) {
crc = crc32c(crc, &xhdr[i].hic_xheader,
sizeof(struct xlog_rec_ext_header));
}
@@ -1746,7 +1768,7 @@ xlog_sync(
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
int size;
- XFS_STATS_INC(xs_log_writes);
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/* Add for LR header */
@@ -1783,7 +1805,7 @@ xlog_sync(
bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
- XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
+ XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
@@ -2031,26 +2053,24 @@ xlog_print_tic_res(
"SWAPEXT"
};
- xfs_warn(mp,
- "xlog_write: reservation summary:\n"
- " trans type = %s (%u)\n"
- " unit res = %d bytes\n"
- " current res = %d bytes\n"
- " total reg = %u bytes (o/flow = %u bytes)\n"
- " ophdrs = %u (ophdr space = %u bytes)\n"
- " ophdr + reg = %u bytes\n"
- " num regions = %u",
- ((ticket->t_trans_type <= 0 ||
- ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+ xfs_warn(mp, "xlog_write: reservation summary:");
+ xfs_warn(mp, " trans type = %s (%u)",
+ ((ticket->t_trans_type <= 0 ||
+ ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
"bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
- ticket->t_trans_type,
- ticket->t_unit_res,
- ticket->t_curr_res,
- ticket->t_res_arr_sum, ticket->t_res_o_flow,
- ticket->t_res_num_ophdrs, ophdr_spc,
- ticket->t_res_arr_sum +
- ticket->t_res_o_flow + ophdr_spc,
- ticket->t_res_num);
+ ticket->t_trans_type);
+ xfs_warn(mp, " unit res = %d bytes",
+ ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes",
+ ticket->t_curr_res);
+ xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
+ ticket->t_res_arr_sum, ticket->t_res_o_flow);
+ xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
+ ticket->t_res_num_ophdrs, ophdr_spc);
+ xfs_warn(mp, " ophdr + reg = %u bytes",
+ ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
+ xfs_warn(mp, " num regions = %u",
+ ticket->t_res_num);
for (i = 0; i < ticket->t_res_num; i++) {
uint r_type = ticket->t_res_arr[i].r_type;
@@ -2402,11 +2422,20 @@ xlog_write(
&partial_copy_len);
xlog_verify_dest_ptr(log, ptr);
- /* copy region */
+ /*
+ * Copy region.
+ *
+ * Unmount records just log an opheader, so can have
+ * empty payloads with no data region to copy. Hence we
+ * only copy the payload if the vector says it has data
+ * to copy.
+ */
ASSERT(copy_len >= 0);
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
-
+ if (copy_len > 0) {
+ memcpy(ptr, reg->i_addr + copy_off, copy_len);
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ copy_len);
+ }
copy_len += start_rec_copy + sizeof(xlog_op_header_t);
record_cnt++;
data_cnt += contwr ? copy_len : 0;
@@ -2893,7 +2922,7 @@ restart:
iclog = log->l_iclog;
if (iclog->ic_state != XLOG_STATE_ACTIVE) {
- XFS_STATS_INC(xs_log_noiclogs);
+ XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
/* Wait for log writes to have flushed */
xlog_wait(&log->l_flush_wait, &log->l_icloglock);
@@ -3145,11 +3174,19 @@ xlog_state_switch_iclogs(
}
if (log->l_curr_block >= log->l_logBBsize) {
+ /*
+ * Rewind the current block before the cycle is bumped to make
+ * sure that the combined LSN never transiently moves forward
+ * when the log wraps to the next cycle. This is to support the
+ * unlocked sample of these fields from xlog_valid_lsn(). Most
+ * other cases should acquire l_icloglock.
+ */
+ log->l_curr_block -= log->l_logBBsize;
+ ASSERT(log->l_curr_block >= 0);
+ smp_wmb();
log->l_curr_cycle++;
if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
log->l_curr_cycle++;
- log->l_curr_block -= log->l_logBBsize;
- ASSERT(log->l_curr_block >= 0);
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
@@ -3192,7 +3229,7 @@ _xfs_log_force(
struct xlog_in_core *iclog;
xfs_lsn_t lsn;
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
xlog_cil_force(log);
@@ -3277,7 +3314,7 @@ maybe_sleep:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -3342,7 +3379,7 @@ _xfs_log_force_lsn(
ASSERT(lsn != 0);
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
lsn = xlog_cil_force_lsn(log, lsn);
if (lsn == NULLCOMMITLSN)
@@ -3391,7 +3428,7 @@ try_again:
(XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
@@ -3421,7 +3458,7 @@ try_again:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -3664,7 +3701,7 @@ xlog_ticket_alloc(
void
xlog_verify_dest_ptr(
struct xlog *log,
- char *ptr)
+ void *ptr)
{
int i;
int good_ptr = 0;
@@ -3767,9 +3804,8 @@ xlog_verify_iclog(
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
xlog_in_core_2_t *xhdr;
- xfs_caddr_t ptr;
- xfs_caddr_t base_ptr;
- __psint_t field_offset;
+ void *base_ptr, *ptr, *p;
+ ptrdiff_t field_offset;
__uint8_t clientid;
int len, i, j, k, op_len;
int idx;
@@ -3788,9 +3824,9 @@ xlog_verify_iclog(
if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
- ptr = (xfs_caddr_t) &iclog->ic_header;
- for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
- ptr += BBSIZE) {
+ base_ptr = ptr = &iclog->ic_header;
+ p = &iclog->ic_header;
+ for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
xfs_emerg(log->l_mp, "%s: unexpected magic num",
__func__);
@@ -3798,20 +3834,19 @@ xlog_verify_iclog(
/* check fields */
len = be32_to_cpu(iclog->ic_header.h_num_logops);
- ptr = iclog->ic_datap;
- base_ptr = ptr;
- ophead = (xlog_op_header_t *)ptr;
+ base_ptr = ptr = iclog->ic_datap;
+ ophead = ptr;
xhdr = iclog->ic_data;
for (i = 0; i < len; i++) {
- ophead = (xlog_op_header_t *)ptr;
+ ophead = ptr;
/* clientid is only 1 byte */
- field_offset = (__psint_t)
- ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+ p = &ophead->oh_clientid;
+ field_offset = p - base_ptr;
if (!syncing || (field_offset & 0x1ff)) {
clientid = ophead->oh_clientid;
} else {
- idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+ idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,13 +3864,13 @@ xlog_verify_iclog(
(unsigned long)field_offset);
/* check length */
- field_offset = (__psint_t)
- ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+ p = &ophead->oh_len;
+ field_offset = p - base_ptr;
if (!syncing || (field_offset & 0x1ff)) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
- idx = BTOBBT((__psint_t)&ophead->oh_len -
- (__psint_t)iclog->ic_datap);
+ idx = BTOBBT((uintptr_t)&ophead->oh_len -
+ (uintptr_t)iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -4005,3 +4040,45 @@ xlog_iclogs_empty(
return 1;
}
+/*
+ * Verify that an LSN stamped into a piece of metadata is valid. This is
+ * intended for use in read verifiers on v5 superblocks.
+ */
+bool
+xfs_log_check_lsn(
+ struct xfs_mount *mp,
+ xfs_lsn_t lsn)
+{
+ struct xlog *log = mp->m_log;
+ bool valid;
+
+ /*
+ * norecovery mode skips mount-time log processing and unconditionally
+ * resets the in-core LSN. We can't validate in this mode, but
+ * modifications are not allowed anyway so just return true.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return true;
+
+ /*
+ * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
+ * handled by recovery and thus safe to ignore here.
+ */
+ if (lsn == NULLCOMMITLSN)
+ return true;
+
+ valid = xlog_valid_lsn(mp->m_log, lsn);
+
+ /* warn the user about what's gone wrong before verifier failure */
+ if (!valid) {
+ spin_lock(&log->l_icloglock);
+ xfs_warn(mp,
+"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
+"Please unmount and run xfs_repair (>= v4.3) to resolve.",
+ CYCLE_LSN(lsn), BLOCK_LSN(lsn),
+ log->l_curr_cycle, log->l_curr_block);
+ spin_unlock(&log->l_icloglock);
+ }
+
+ return valid;
+}
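For context, the new xfs_log_check_lsn() helper is intended to be called from v5 read
verifiers before trusting an LSN stamped into on-disk metadata. A minimal sketch of that
usage pattern (the function and field names here are illustrative, not part of this patch):

	/* Sketch: gate acceptance of on-disk metadata on its stamped LSN. */
	static bool
	xfs_example_verify_lsn(
		struct xfs_mount	*mp,
		__be64			disk_lsn)	/* LSN field from an on-disk header */
	{
		/* Only v5 (CRC-enabled) filesystems stamp LSNs into metadata. */
		if (!xfs_sb_version_hascrc(&mp->m_sb))
			return true;

		/* Reject metadata whose LSN is ahead of the current log head. */
		return xfs_log_check_lsn(mp, be64_to_cpu(disk_lsn));
	}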
diff --git a/kernel/fs/xfs/xfs_log.h b/kernel/fs/xfs/xfs_log.h
index 84e0deb95..aa533a7d5 100644
--- a/kernel/fs/xfs/xfs_log.h
+++ b/kernel/fs/xfs/xfs_log.h
@@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
/*
- * Macros, structures, prototypes for interface to the log manager.
- */
-
-/*
- * Flags to xfs_log_done()
- */
-#define XFS_LOG_REL_PERM_RESERV 0x1
-
-/*
* Flags to xfs_log_force()
*
* XFS_LOG_SYNC: Synchronous force in-core log to disk
@@ -138,7 +129,7 @@ struct xfs_log_callback;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- uint flags);
+ bool regrant);
int _xfs_log_force(struct xfs_mount *mp,
uint flags,
int *log_forced);
@@ -156,6 +147,7 @@ int xfs_log_mount(struct xfs_mount *mp,
xfs_daddr_t start_block,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
+int xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
@@ -183,11 +175,12 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn, int flags);
+ xfs_lsn_t *commit_lsn, bool regrant);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
+bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
#endif /* __XFS_LOG_H__ */
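The header above only declares xfs_log_mount_cancel(); a plausible implementation,
consistent with the xlog_recover_cancel() hook added in xfs_log_priv.h and
xfs_log_recover.c further below, is a thin wrapper (sketch, not quoted from this patch):

	/* Sketch: undo the first phase of log mount when xfs_mountfs() fails later. */
	int
	xfs_log_mount_cancel(
		struct xfs_mount	*mp)
	{
		int			error;

		/* Drop any EFIs that recovery pass 1 left sitting on the AIL. */
		error = xlog_recover_cancel(mp->m_log);

		/* Then tear the log down exactly as a normal unmount would. */
		xfs_log_unmount(mp);

		return error;
	}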
diff --git a/kernel/fs/xfs/xfs_log_cil.c b/kernel/fs/xfs/xfs_log_cil.c
index 45cc0ce18..4e7649351 100644
--- a/kernel/fs/xfs/xfs_log_cil.c
+++ b/kernel/fs/xfs/xfs_log_cil.c
@@ -307,7 +307,13 @@ xlog_cil_insert_items(
if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
- list_move_tail(&lip->li_cil, &cil->xc_cil);
+ /*
+ * Only move the item if it isn't already at the tail. This is
+ * to prevent a transient list_empty() state when reinserting
+ * an item that is already the only item in the CIL.
+ */
+ if (!list_is_last(&lip->li_cil, &cil->xc_cil))
+ list_move_tail(&lip->li_cil, &cil->xc_cil);
}
/* account for space used by new iovec headers */
@@ -624,7 +630,7 @@ restart:
spin_unlock(&cil->xc_push_lock);
/* xfs_log_done always frees the ticket on error. */
- commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
if (commit_lsn == -1)
goto out_abort;
@@ -773,14 +779,10 @@ xfs_log_commit_cil(
struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_lsn_t *commit_lsn,
- int flags)
+ bool regrant)
{
struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
- int log_flags = 0;
-
- if (flags & XFS_TRANS_RELEASE_LOG_RES)
- log_flags = XFS_LOG_REL_PERM_RESERV;
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
@@ -795,7 +797,7 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = tp->t_commit_lsn;
- xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ xfs_log_done(mp, tp->t_ticket, NULL, regrant);
xfs_trans_unreserve_and_mod_sb(tp);
/*
@@ -809,7 +811,7 @@ xfs_log_commit_cil(
* the log items. This affects (at least) processing of stale buffers,
* inodes and EFIs.
*/
- xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
+ xfs_trans_free_items(tp, tp->t_commit_lsn, false);
xlog_cil_push_background(log);
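The list_is_last() guard added to xlog_cil_insert_items() above exists because
list_move_tail() is a delete followed by an add; illustration only, using the semantics
of the <linux/list.h> helpers:

	/*
	 * Illustration: list_move_tail() is list_del() + list_add_tail().
	 * If lip is the only item on the CIL, the list is transiently empty
	 * between those two steps:
	 *
	 *	list_del(&lip->li_cil);				-> xc_cil now empty
	 *	list_add_tail(&lip->li_cil, &cil->xc_cil);	-> populated again
	 *
	 * An unlocked list_empty(&cil->xc_cil) check racing with the move can
	 * therefore observe an empty CIL; skipping the move when the item is
	 * already last closes that window.
	 */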
diff --git a/kernel/fs/xfs/xfs_log_priv.h b/kernel/fs/xfs/xfs_log_priv.h
index db7cbdeb2..8daba7491 100644
--- a/kernel/fs/xfs/xfs_log_priv.h
+++ b/kernel/fs/xfs/xfs_log_priv.h
@@ -409,7 +409,7 @@ struct xlog {
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
- char *l_iclog_bak[XLOG_MAX_ICLOGS];
+ void *l_iclog_bak[XLOG_MAX_ICLOGS];
#endif
};
@@ -426,6 +426,8 @@ xlog_recover(
extern int
xlog_recover_finish(
struct xlog *log);
+extern int
+xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
@@ -558,4 +560,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
remove_wait_queue(wq, &wait);
}
+/*
+ * The LSN is valid so long as it is behind the current LSN. If it isn't, this
+ * means that the next log record that includes this metadata could have a
+ * smaller LSN. In turn, this means that the modification in the log would not
+ * replay.
+ */
+static inline bool
+xlog_valid_lsn(
+ struct xlog *log,
+ xfs_lsn_t lsn)
+{
+ int cur_cycle;
+ int cur_block;
+ bool valid = true;
+
+ /*
+ * First, sample the current lsn without locking to avoid added
+ * contention from metadata I/O. The current cycle and block are updated
+ * (in xlog_state_switch_iclogs()) and read here in a particular order
+ * to avoid false negatives (e.g., thinking the metadata LSN is valid
+ * when it is not).
+ *
+ * The current block is always rewound before the cycle is bumped in
+ * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
+ * a transiently forward state. Instead, we can see the LSN in a
+ * transiently behind state if we happen to race with a cycle wrap.
+ */
+ cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+ smp_rmb();
+ cur_block = ACCESS_ONCE(log->l_curr_block);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
+ /*
+ * If the metadata LSN appears invalid, it's possible the check
+ * above raced with a wrap to the next log cycle. Grab the lock
+ * to check for sure.
+ */
+ spin_lock(&log->l_icloglock);
+ cur_cycle = log->l_curr_cycle;
+ cur_block = log->l_curr_block;
+ spin_unlock(&log->l_icloglock);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
+ valid = false;
+ }
+
+ return valid;
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
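An XFS LSN packs the cycle number into the high 32 bits and the block number into the
low 32 bits, which is what CYCLE_LSN() and BLOCK_LSN() extract. A stand-alone
restatement of the comparison xlog_valid_lsn() relies on (sketch only; the helper name
is made up):

	/* Sketch: an LSN is "valid" if it is at or behind the sampled log head. */
	static bool
	example_lsn_behind_head(
		xfs_lsn_t	lsn,
		int		cur_cycle,
		int		cur_block)
	{
		if (CYCLE_LSN(lsn) != cur_cycle)
			return CYCLE_LSN(lsn) < cur_cycle;
		return BLOCK_LSN(lsn) <= cur_block;
	}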
diff --git a/kernel/fs/xfs/xfs_log_recover.c b/kernel/fs/xfs/xfs_log_recover.c
index a5d03396d..c5ecaacdd 100644
--- a/kernel/fs/xfs/xfs_log_recover.c
+++ b/kernel/fs/xfs/xfs_log_recover.c
@@ -147,7 +147,7 @@ xlog_put_bp(
* Return the address of the start of the given block number's data
* in a log buffer. The buffer covers a log sector-aligned region.
*/
-STATIC xfs_caddr_t
+STATIC char *
xlog_align(
struct xlog *log,
xfs_daddr_t blk_no,
@@ -203,7 +203,7 @@ xlog_bread(
xfs_daddr_t blk_no,
int nbblks,
struct xfs_buf *bp,
- xfs_caddr_t *offset)
+ char **offset)
{
int error;
@@ -225,9 +225,9 @@ xlog_bread_offset(
xfs_daddr_t blk_no, /* block to read from */
int nbblks, /* blocks to read */
struct xfs_buf *bp,
- xfs_caddr_t offset)
+ char *offset)
{
- xfs_caddr_t orig_offset = bp->b_addr;
+ char *orig_offset = bp->b_addr;
int orig_len = BBTOB(bp->b_length);
int error, error2;
@@ -396,7 +396,7 @@ xlog_find_cycle_start(
xfs_daddr_t *last_blk,
uint cycle)
{
- xfs_caddr_t offset;
+ char *offset;
xfs_daddr_t mid_blk;
xfs_daddr_t end_blk;
uint mid_cycle;
@@ -443,7 +443,7 @@ xlog_find_verify_cycle(
uint cycle;
xfs_buf_t *bp;
xfs_daddr_t bufblks;
- xfs_caddr_t buf = NULL;
+ char *buf = NULL;
int error = 0;
/*
@@ -509,7 +509,7 @@ xlog_find_verify_log_record(
{
xfs_daddr_t i;
xfs_buf_t *bp;
- xfs_caddr_t offset = NULL;
+ char *offset = NULL;
xlog_rec_header_t *head = NULL;
int error = 0;
int smallmem = 0;
@@ -616,7 +616,7 @@ xlog_find_head(
xfs_daddr_t *return_head_blk)
{
xfs_buf_t *bp;
- xfs_caddr_t offset;
+ char *offset;
xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
int num_scan_bblks;
uint first_half_cycle, last_half_cycle;
@@ -891,7 +891,7 @@ xlog_find_tail(
{
xlog_rec_header_t *rhead;
xlog_op_header_t *op_head;
- xfs_caddr_t offset = NULL;
+ char *offset = NULL;
xfs_buf_t *bp;
int error, i, found;
xfs_daddr_t umount_data_blk;
@@ -1099,7 +1099,7 @@ xlog_find_zeroed(
xfs_daddr_t *blk_no)
{
xfs_buf_t *bp;
- xfs_caddr_t offset;
+ char *offset;
uint first_cycle, last_cycle;
xfs_daddr_t new_blk, last_blk, start_blk;
xfs_daddr_t num_scan_bblks;
@@ -1199,7 +1199,7 @@ bp_err:
STATIC void
xlog_add_record(
struct xlog *log,
- xfs_caddr_t buf,
+ char *buf,
int cycle,
int block,
int tail_cycle,
@@ -1227,7 +1227,7 @@ xlog_write_log_records(
int tail_cycle,
int tail_block)
{
- xfs_caddr_t offset;
+ char *offset;
xfs_buf_t *bp;
int balign, ealign;
int sectbb = log->l_sectBBsize;
@@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer(
return -EFSCORRUPTED;
}
- buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
- next_unlinked_offset);
+ buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
*buffer_nextp = *logged_nextp;
/*
@@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer(
* have to leave the inode in a consistent state for whoever
* reads it next....
*/
- xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+ xfs_dinode_calc_crc(mp,
xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
}
@@ -1896,15 +1895,25 @@ xlog_recover_get_buf_lsn(
*/
goto recover_immediately;
case XFS_SB_MAGIC:
+ /*
+ * superblock uuids are magic. We may or may not have a
+ * sb_meta_uuid on disk, but it will be set in the in-core
+ * superblock. We set the uuid pointer for verification
+ * according to the superblock feature mask to ensure we check
+ * the relevant UUID in the superblock.
+ */
lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
- uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ if (xfs_sb_version_hasmetauuid(&mp->m_sb))
+ uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
+ else
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
break;
default:
break;
}
if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
goto recover_immediately;
return lsn;
}
@@ -2508,8 +2517,8 @@ xlog_recover_inode_pass2(
xfs_buf_t *bp;
xfs_dinode_t *dip;
int len;
- xfs_caddr_t src;
- xfs_caddr_t dest;
+ char *src;
+ char *dest;
int error;
int attr_index;
uint fields;
@@ -2551,7 +2560,7 @@ xlog_recover_inode_pass2(
goto out_release;
}
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
+ dip = xfs_buf_offset(bp, in_f->ilf_boffset);
/*
* Make sure the place we're flushing out to really looks
@@ -2890,7 +2899,7 @@ xlog_recover_dquot_pass2(
return error;
ASSERT(bp);
- ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+ ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
/*
* If the dquot has an LSN in it, recover the dquot only if it's less
@@ -2934,16 +2943,16 @@ xlog_recover_efi_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
- xfs_mount_t *mp = log->l_mp;
- xfs_efi_log_item_t *efip;
- xfs_efi_log_format_t *efi_formatp;
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_efi_log_format *efi_formatp;
efi_formatp = item->ri_buf[0].i_addr;
efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
- if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
- &(efip->efi_format)))) {
+ error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+ if (error) {
xfs_efi_item_free(efip);
return error;
}
@@ -2951,20 +2960,23 @@ xlog_recover_efi_pass2(
spin_lock(&log->l_ailp->xa_lock);
/*
- * xfs_trans_ail_update() drops the AIL lock.
+ * The EFI has two references. One for the EFD and one for EFI to ensure
+ * it makes it into the AIL. Insert the EFI into the AIL directly and
+ * drop the EFI reference. Note that xfs_trans_ail_update() drops the
+ * AIL lock.
*/
xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
+ xfs_efi_release(efip);
return 0;
}
/*
- * This routine is called when an efd format structure is found in
- * a committed transaction in the log. It's purpose is to cancel
- * the corresponding efi if it was still in the log. To do this
- * it searches the AIL for the efi with an id equal to that in the
- * efd format structure. If we find it, we remove the efi from the
- * AIL and free it.
+ * This routine is called when an EFD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding EFI if it
+ * was still in the log. To do this it searches the AIL for the EFI with an id
+ * equal to that in the EFD format structure. If we find it we drop the EFD
+ * reference, which removes the EFI from the AIL and frees it.
*/
STATIC int
xlog_recover_efd_pass2(
@@ -2986,8 +2998,8 @@ xlog_recover_efd_pass2(
efi_id = efd_formatp->efd_efi_id;
/*
- * Search for the efi with the id in the efd format structure
- * in the AIL.
+ * Search for the EFI with the id in the EFD format structure in the
+ * AIL.
*/
spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
@@ -2996,18 +3008,18 @@ xlog_recover_efd_pass2(
efip = (xfs_efi_log_item_t *)lip;
if (efip->efi_format.efi_id == efi_id) {
/*
- * xfs_trans_ail_delete() drops the
- * AIL lock.
+ * Drop the EFD reference to the EFI. This
+ * removes the EFI from the AIL and frees it.
*/
- xfs_trans_ail_delete(ailp, lip,
- SHUTDOWN_CORRUPT_INCORE);
- xfs_efi_item_free(efip);
+ spin_unlock(&ailp->xa_lock);
+ xfs_efi_release(efip);
spin_lock(&ailp->xa_lock);
break;
}
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
@@ -3035,6 +3047,11 @@ xlog_recover_do_icreate_pass2(
unsigned int count;
unsigned int isize;
xfs_agblock_t length;
+ int blks_per_cluster;
+ int bb_per_cluster;
+ int cancel_count;
+ int nbufs;
+ int i;
icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
if (icl->icl_type != XFS_LI_ICREATE) {
@@ -3073,32 +3090,65 @@ xlog_recover_do_icreate_pass2(
return -EINVAL;
}
- /* existing allocation is fixed value */
- ASSERT(count == mp->m_ialloc_inos);
- ASSERT(length == mp->m_ialloc_blks);
- if (count != mp->m_ialloc_inos ||
- length != mp->m_ialloc_blks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ /*
+ * The inode chunk is either full or sparse and we only support
+ * m_ialloc_min_blks sized sparse allocations at this time.
+ */
+ if (length != mp->m_ialloc_blks &&
+ length != mp->m_ialloc_min_blks) {
+ xfs_warn(log->l_mp,
+ "%s: unsupported chunk length", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ /* verify inode count is consistent with extent length */
+ if ((count >> mp->m_sb.sb_inopblog) != length) {
+ xfs_warn(log->l_mp,
+ "%s: inconsistent inode count and chunk length",
+ __FUNCTION__);
return -EINVAL;
}
/*
- * Inode buffers can be freed. Do not replay the inode initialisation as
- * we could be overwriting something written after this inode buffer was
- * cancelled.
+ * The icreate transaction can cover multiple cluster buffers and these
+ * buffers could have been freed and reused. Check the individual
+ * buffers for cancellation so we don't overwrite anything written after
+ * a cancellation.
+ */
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ nbufs = length / blks_per_cluster;
+ for (i = 0, cancel_count = 0; i < nbufs; i++) {
+ xfs_daddr_t daddr;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno,
+ agbno + i * blks_per_cluster);
+ if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
+ cancel_count++;
+ }
+
+ /*
+ * We currently only use icreate for a single allocation at a time. This
+ * means we should expect either all or none of the buffers to be
+ * cancelled. Be conservative and skip replay if at least one buffer is
+ * cancelled, but warn the user that something is awry if the buffers
+ * are not consistent.
*
- * XXX: we need to iterate all buffers and only init those that are not
- * cancelled. I think that a more fine grained factoring of
- * xfs_ialloc_inode_init may be appropriate here to enable this to be
- * done easily.
+ * XXX: This must be refined to only skip cancelled clusters once we use
+ * icreate for multiple chunk allocations.
*/
- if (xlog_check_buffer_cancelled(log,
- XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ ASSERT(!cancel_count || cancel_count == nbufs);
+ if (cancel_count) {
+ if (cancel_count != nbufs)
+ xfs_warn(mp,
+ "WARNING: partial inode chunk cancellation, skipped icreate.");
+ trace_xfs_log_recover_icreate_cancel(log, icl);
return 0;
+ }
- xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
- be32_to_cpu(icl->icl_gen));
- return 0;
+ trace_xfs_log_recover_icreate_recover(log, icl);
+ return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
+ length, be32_to_cpu(icl->icl_gen));
}
STATIC void
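To make the new count/length consistency check in xlog_recover_do_icreate_pass2()
concrete, here is a worked example with a hypothetical geometry (numbers chosen for
illustration, not taken from this patch):

	/*
	 * Illustration: 4096-byte blocks with 512-byte inodes give 8 inodes
	 * per block, i.e. sb_inopblog = 3.
	 *
	 *	full chunk:	count = 64, length = 8	-> 64 >> 3 == 8, accepted
	 *	sparse chunk:	count = 32, length = 4	-> 32 >> 3 == 4, accepted
	 *	corrupt item:	count = 64, length = 4	-> 64 >> 3 != 4, -EINVAL
	 */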
@@ -3369,21 +3419,31 @@ STATIC int
xlog_recover_add_to_cont_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
int len)
{
xlog_recover_item_t *item;
- xfs_caddr_t ptr, old_ptr;
+ char *ptr, *old_ptr;
int old_len;
+ /*
+ * If the transaction is empty, the header was split across this and the
+ * previous record. Copy the rest of the header.
+ */
if (list_empty(&trans->r_itemq)) {
- /* finish copying rest of trans header */
+ ASSERT(len <= sizeof(struct xfs_trans_header));
+ if (len > sizeof(struct xfs_trans_header)) {
+ xfs_warn(log->l_mp, "%s: bad header length", __func__);
+ return -EIO;
+ }
+
xlog_recover_add_item(&trans->r_itemq);
- ptr = (xfs_caddr_t) &trans->r_theader +
- sizeof(xfs_trans_header_t) - len;
+ ptr = (char *)&trans->r_theader +
+ sizeof(struct xfs_trans_header) - len;
memcpy(ptr, dp, len);
return 0;
}
+
/* take the tail entry */
item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
@@ -3415,12 +3475,12 @@ STATIC int
xlog_recover_add_to_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
int len)
{
xfs_inode_log_format_t *in_f; /* any will do */
xlog_recover_item_t *item;
- xfs_caddr_t ptr;
+ char *ptr;
if (!len)
return 0;
@@ -3432,7 +3492,19 @@ xlog_recover_add_to_trans(
ASSERT(0);
return -EIO;
}
- if (len == sizeof(xfs_trans_header_t))
+
+ if (len > sizeof(struct xfs_trans_header)) {
+ xfs_warn(log->l_mp, "%s: bad header length", __func__);
+ ASSERT(0);
+ return -EIO;
+ }
+
+ /*
+ * The transaction header can be arbitrarily split across op
+ * records. If we don't have the whole thing here, copy what we
+ * do have and handle the rest in the next record.
+ */
+ if (len == sizeof(struct xfs_trans_header))
xlog_recover_add_item(&trans->r_itemq);
memcpy(&trans->r_theader, dp, len);
return 0;
@@ -3509,7 +3581,7 @@ STATIC int
xlog_recovery_process_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
unsigned int len,
unsigned int flags,
int pass)
@@ -3616,8 +3688,8 @@ xlog_recover_process_ophdr(
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
struct xlog_op_header *ohead,
- xfs_caddr_t dp,
- xfs_caddr_t end,
+ char *dp,
+ char *end,
int pass)
{
struct xlog_recover *trans;
@@ -3666,11 +3738,11 @@ xlog_recover_process_data(
struct xlog *log,
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
int pass)
{
struct xlog_op_header *ohead;
- xfs_caddr_t end;
+ char *end;
int num_logops;
int error;
@@ -3735,7 +3807,7 @@ xlog_recover_process_efi(
* free the memory associated with it.
*/
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- xfs_efi_release(efip, efip->efi_format.efi_nextents);
+ xfs_efi_release(efip);
return -EIO;
}
}
@@ -3748,19 +3820,19 @@ xlog_recover_process_efi(
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &(efip->efi_format.efi_extents[i]);
- error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+ error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
+ extp->ext_len);
if (error)
goto abort_error;
- xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
- extp->ext_len);
+
}
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
return error;
abort_error:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
@@ -3784,10 +3856,10 @@ abort_error:
*/
STATIC int
xlog_recover_process_efis(
- struct xlog *log)
+ struct xlog *log)
{
- xfs_log_item_t *lip;
- xfs_efi_log_item_t *efip;
+ struct xfs_log_item *lip;
+ struct xfs_efi_log_item *efip;
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
@@ -3811,7 +3883,7 @@ xlog_recover_process_efis(
/*
* Skip EFIs that we've already processed.
*/
- efip = (xfs_efi_log_item_t *)lip;
+ efip = container_of(lip, struct xfs_efi_log_item, efi_item);
if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
lip = xfs_trans_ail_cursor_next(ailp, &cur);
continue;
@@ -3831,6 +3903,50 @@ out:
}
/*
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending EFIs so they don't pin the AIL.
+ */
+STATIC int
+xlog_recover_cancel_efis(
+ struct xlog *log)
+{
+ struct xfs_log_item *lip;
+ struct xfs_efi_log_item *efip;
+ int error = 0;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp;
+
+ ailp = log->l_ailp;
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ /*
+ * We're done when we see something other than an EFI.
+ * There should be no EFIs left in the AIL now.
+ */
+ if (lip->li_type != XFS_LI_EFI) {
+#ifdef DEBUG
+ for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+ ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
+ break;
+ }
+
+ efip = container_of(lip, struct xfs_efi_log_item, efi_item);
+
+ spin_unlock(&ailp->xa_lock);
+ xfs_efi_release(efip);
+ spin_lock(&ailp->xa_lock);
+
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->xa_lock);
+ return error;
+}
+
+/*
* This routine performs a transaction to null out a bad inode pointer
* in an agi unlinked inode hash bucket.
*/
@@ -3862,13 +3978,13 @@ xlog_recover_clear_agi_bucket(
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
goto out_error;
return;
out_abort:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
out_error:
xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
return;
@@ -4015,7 +4131,7 @@ xlog_recover_process_iunlinks(
STATIC int
xlog_unpack_data_crc(
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
struct xlog *log)
{
__le32 crc;
@@ -4045,7 +4161,7 @@ xlog_unpack_data_crc(
STATIC int
xlog_unpack_data(
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
struct xlog *log)
{
int i, j, k;
@@ -4127,7 +4243,7 @@ xlog_do_recovery_pass(
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
- xfs_caddr_t offset;
+ char *offset;
xfs_buf_t *hbp, *dbp;
int error = 0, h_size;
int bblks, split_bblks;
@@ -4493,9 +4609,19 @@ xlog_recover(
int error;
/* find the tail of the log */
- if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+ error = xlog_find_tail(log, &head_blk, &tail_blk);
+ if (error)
return error;
+ /*
+ * The superblock was read before the log was available and thus the LSN
+ * could not be verified. Check the superblock LSN against the current
+ * LSN now that it's known.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+ !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
+ return -EINVAL;
+
if (tail_blk != head_blk) {
/* There used to be a comment here:
*
@@ -4523,11 +4649,13 @@ xlog_recover(
xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
xfs_warn(log->l_mp,
-"Superblock has unknown incompatible log features (0x%x) enabled.\n"
-"The log can not be fully and/or safely recovered by this kernel.\n"
-"Please recover the log on a kernel that supports the unknown features.",
+"Superblock has unknown incompatible log features (0x%x) enabled.",
(log->l_mp->m_sb.sb_features_log_incompat &
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ xfs_warn(log->l_mp,
+"The log can not be fully and/or safely recovered by this kernel.");
+ xfs_warn(log->l_mp,
+"Please recover the log on a kernel that supports the unknown features.");
return -EINVAL;
}
@@ -4603,6 +4731,17 @@ xlog_recover_finish(
return 0;
}
+int
+xlog_recover_cancel(
+ struct xlog *log)
+{
+ int error = 0;
+
+ if (log->l_flags & XLOG_RECOVERY_NEEDED)
+ error = xlog_recover_cancel_efis(log);
+
+ return error;
+}
#if defined(DEBUG)
/*
diff --git a/kernel/fs/xfs/xfs_message.c b/kernel/fs/xfs/xfs_message.c
index d8b67547a..11792d888 100644
--- a/kernel/fs/xfs/xfs_message.c
+++ b/kernel/fs/xfs/xfs_message.c
@@ -17,6 +17,7 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_error.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -43,6 +44,7 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
+ int level; \
\
va_start(args, fmt); \
\
@@ -51,6 +53,11 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
\
__xfs_printk(kern_level, mp, &vaf); \
va_end(args); \
+ \
+ if (!kstrtoint(kern_level, 0, &level) && \
+ level <= LOGLEVEL_ERR && \
+ xfs_error_level >= XFS_ERRLEVEL_HIGH) \
+ xfs_stack_trace(); \
} \
define_xfs_printk_level(xfs_emerg, KERN_EMERG);
diff --git a/kernel/fs/xfs/xfs_mount.c b/kernel/fs/xfs/xfs_mount.c
index 6f23fbdfb..bb753b359 100644
--- a/kernel/fs/xfs/xfs_mount.c
+++ b/kernel/fs/xfs/xfs_mount.c
@@ -47,6 +47,16 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
+void
+xfs_uuid_table_free(void)
+{
+ if (xfs_uuid_table_size == 0)
+ return;
+ kmem_free(xfs_uuid_table);
+ xfs_uuid_table = NULL;
+ xfs_uuid_table_size = 0;
+}
+
/*
* See if the UUID is unique among mounted XFS filesystems.
* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -615,14 +625,14 @@ xfs_default_resblks(xfs_mount_t *mp)
*/
int
xfs_mountfs(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_sb_t *sbp = &(mp->m_sb);
- xfs_inode_t *rip;
- __uint64_t resblks;
- uint quotamount = 0;
- uint quotaflags = 0;
- int error = 0;
+ struct xfs_sb *sbp = &(mp->m_sb);
+ struct xfs_inode *rip;
+ __uint64_t resblks;
+ uint quotamount = 0;
+ uint quotaflags = 0;
+ int error = 0;
xfs_sb_mount_common(mp, sbp);
@@ -693,10 +703,15 @@ xfs_mountfs(
if (error)
goto out;
- error = xfs_uuid_mount(mp);
+ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+ &mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_del_stats;
+
/*
* Set the minimum read and write sizes
*/
@@ -725,6 +740,22 @@ xfs_mountfs(
}
/*
+ * If enabled, sparse inode chunk alignment is expected to match the
+ * cluster size. Full inode chunk alignment must match the chunk size,
+ * but that is checked on sb read verification...
+ */
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
+ mp->m_sb.sb_spino_align !=
+ XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+ xfs_warn(mp,
+ "Sparse inode block alignment (%u) must match cluster size (%llu).",
+ mp->m_sb.sb_spino_align,
+ XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+ error = -EINVAL;
+ goto out_remove_uuid;
+ }
+
+ /*
* Set inode alignment fields
*/
xfs_set_inoalignment(mp);
@@ -783,7 +814,9 @@ xfs_mountfs(
}
/*
- * log's mount-time initialization. Perform 1st part recovery if needed
+ * Log's mount-time initialization. The first part of recovery can place
+ * some items on the AIL, to be handled when recovery is finished or
+ * cancelled.
*/
error = xfs_log_mount(mp, mp->m_logdev_targp,
XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
@@ -894,9 +927,9 @@ xfs_mountfs(
}
/*
- * Finish recovering the file system. This part needed to be
- * delayed until after the root and real-time bitmap inodes
- * were consistently read in.
+ * Finish recovering the file system. This part needed to be delayed
+ * until after the root and real-time bitmap inodes were consistently
+ * read in.
*/
error = xfs_log_mount_finish(mp);
if (error) {
@@ -939,8 +972,10 @@ xfs_mountfs(
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
out_log_dealloc:
- xfs_log_unmount(mp);
+ xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_wait_buftarg(mp->m_logdev_targp);
@@ -951,6 +986,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_del_stats:
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
@@ -1027,6 +1064,7 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
+
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1036,6 +1074,7 @@ xfs_unmountfs(
#endif
xfs_free_perag(mp);
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
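For the sparse inode alignment check added to xfs_mountfs() above, a worked example
with an assumed geometry (illustration only):

	/*
	 * Illustration: 4096-byte filesystem blocks and an 8192-byte inode
	 * cluster give an expected alignment of
	 *
	 *	XFS_B_TO_FSBT(mp, 8192) == 2 filesystem blocks,
	 *
	 * so a superblock advertising sb_spino_align == 2 mounts normally,
	 * while any other value now fails the mount with -EINVAL.
	 */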
diff --git a/kernel/fs/xfs/xfs_mount.h b/kernel/fs/xfs/xfs_mount.h
index 8c995a2cc..b57098481 100644
--- a/kernel/fs/xfs/xfs_mount.h
+++ b/kernel/fs/xfs/xfs_mount.h
@@ -101,6 +101,8 @@ typedef struct xfs_mount {
__uint64_t m_flags; /* global mount flags */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
+ int m_ialloc_min_blks;/* min blocks in sparse inode
+ * allocation */
int m_inoalign_mask;/* mask sb_inoalignmt if used */
uint m_qflags; /* quota status flags */
struct xfs_trans_resv m_resv; /* precomputed res values */
@@ -125,6 +127,7 @@ typedef struct xfs_mount {
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
struct xfs_kobj m_kobj;
+ struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_data_workqueue;
@@ -179,6 +182,8 @@ typedef struct xfs_mount {
allocator */
#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
+#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */
+
/*
* Default minimum read and write sizes.
@@ -308,6 +313,7 @@ typedef struct xfs_perag {
int pagb_count; /* pagb slots in use */
} xfs_perag_t;
+extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
@@ -332,4 +338,7 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
extern void xfs_set_low_space_thresholds(struct xfs_mount *);
+int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb);
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/kernel/fs/xfs/xfs_pnfs.c b/kernel/fs/xfs/xfs_pnfs.c
index 981a657ec..dc6221942 100644
--- a/kernel/fs/xfs/xfs_pnfs.c
+++ b/kernel/fs/xfs/xfs_pnfs.c
@@ -181,6 +181,11 @@ xfs_fs_map_blocks(
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * xfs_iomap_write_direct() expects to take ownership of
+ * the shared ilock.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
error = xfs_iomap_write_direct(ip, offset, length,
&imap, nimaps);
if (error)
@@ -306,7 +311,7 @@ xfs_fs_commit_blocks(
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_drop_iolock;
}
@@ -321,7 +326,7 @@ xfs_fs_commit_blocks(
}
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_drop_iolock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/kernel/fs/xfs/xfs_qm.c b/kernel/fs/xfs/xfs_qm.c
index 5538468c7..532ab79d3 100644
--- a/kernel/fs/xfs/xfs_qm.c
+++ b/kernel/fs/xfs/xfs_qm.c
@@ -184,7 +184,7 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(mp, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
@@ -448,11 +448,11 @@ xfs_qm_dquot_isolate(
*/
if (dqp->q_nrefs) {
xfs_dqunlock(dqp);
- XFS_STATS_INC(xs_qm_dqwants);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
trace_xfs_dqreclaim_want(dqp);
list_lru_isolate(lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
return LRU_REMOVED;
}
@@ -496,19 +496,19 @@ xfs_qm_dquot_isolate(
ASSERT(dqp->q_nrefs == 0);
list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
trace_xfs_dqreclaim_done(dqp);
- XFS_STATS_INC(xs_qm_dqreclaims);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
return LRU_REMOVED;
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
return LRU_SKIP;
out_unlock_dirty:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
xfs_dqunlock(dqp);
spin_lock(lru_lock);
return LRU_RETRY;
@@ -525,7 +525,7 @@ xfs_qm_shrink_scan(
unsigned long freed;
int error;
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
return 0;
INIT_LIST_HEAD(&isol.buffers);
@@ -756,7 +756,7 @@ xfs_qm_qino_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -764,8 +764,7 @@ xfs_qm_qino_alloc(
error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
&committed);
if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
}
@@ -796,7 +795,7 @@ xfs_qm_qino_alloc(
spin_unlock(&mp->m_sb_lock);
xfs_log_sb(tp);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
xfs_alert(mp, "%s failed (error %d)!", __func__, error);
diff --git a/kernel/fs/xfs/xfs_qm_syscalls.c b/kernel/fs/xfs/xfs_qm_syscalls.c
index 9a25c9275..3640c6e89 100644
--- a/kernel/fs/xfs/xfs_qm_syscalls.c
+++ b/kernel/fs/xfs/xfs_qm_syscalls.c
@@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile(
tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
goto out_put;
}
@@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile(
error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
ASSERT(ip->i_d.di_nextents == 0);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -437,7 +436,7 @@ xfs_qm_scall_setqlim(
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_rele;
}
@@ -548,7 +547,7 @@ xfs_qm_scall_setqlim(
dqp->dq_flags |= XFS_DQ_DIRTY;
xfs_trans_log_dquot(tp, dqp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_rele:
xfs_qm_dqrele(dqp);
@@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end(
* We don't care about quotoff's performance.
*/
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
- return error;
+ return xfs_trans_commit(tp);
}
@@ -605,7 +603,7 @@ xfs_qm_log_quotaoff(
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out;
}
@@ -624,7 +622,7 @@ xfs_qm_log_quotaoff(
* We don't care about quotoff's performance.
*/
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
goto out;
diff --git a/kernel/fs/xfs/xfs_quota.h b/kernel/fs/xfs/xfs_quota.h
index 5376dd406..ce6506ada 100644
--- a/kernel/fs/xfs/xfs_quota.h
+++ b/kernel/fs/xfs/xfs_quota.h
@@ -55,7 +55,6 @@ struct xfs_trans;
typedef struct xfs_dqtrx {
struct xfs_dquot *qt_dquot; /* the dquot this refers to */
ulong qt_blk_res; /* blks reserved on a dquot */
- ulong qt_blk_res_used; /* blks used from the reservation */
ulong qt_ino_res; /* inode reserved on a dquot */
ulong qt_ino_res_used; /* inodes used from the reservation */
long qt_bcount_delta; /* dquot blk count changes */
diff --git a/kernel/fs/xfs/xfs_rtalloc.c b/kernel/fs/xfs/xfs_rtalloc.c
index f2079b691..ab1bac6a3 100644
--- a/kernel/fs/xfs/xfs_rtalloc.c
+++ b/kernel/fs/xfs/xfs_rtalloc.c
@@ -757,32 +757,30 @@ xfs_rtallocate_extent_size(
/*
* Allocate space to the bitmap or summary file, and zero it, for growfs.
*/
-STATIC int /* error */
+STATIC int
xfs_growfs_rt_alloc(
- xfs_mount_t *mp, /* file system mount point */
- xfs_extlen_t oblocks, /* old count of blocks */
- xfs_extlen_t nblocks, /* new count of blocks */
- xfs_inode_t *ip) /* inode (bitmap/summary) */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_extlen_t oblocks, /* old count of blocks */
+ xfs_extlen_t nblocks, /* new count of blocks */
+ struct xfs_inode *ip) /* inode (bitmap/summary) */
{
- xfs_fileoff_t bno; /* block number in file */
- xfs_buf_t *bp; /* temporary buffer for zeroing */
- int committed; /* transaction committed flag */
- xfs_daddr_t d; /* disk block address */
- int error; /* error return value */
- xfs_fsblock_t firstblock; /* first block allocated in xaction */
- xfs_bmap_free_t flist; /* list of freed blocks */
- xfs_fsblock_t fsbno; /* filesystem block for bno */
- xfs_bmbt_irec_t map; /* block map output */
- int nmap; /* number of block maps */
- int resblks; /* space reservation */
+ xfs_fileoff_t bno; /* block number in file */
+ struct xfs_buf *bp; /* temporary buffer for zeroing */
+ int committed; /* transaction committed flag */
+ xfs_daddr_t d; /* disk block address */
+ int error; /* error return value */
+ xfs_fsblock_t firstblock;/* first block allocated in xaction */
+ struct xfs_bmap_free flist; /* list of freed blocks */
+ xfs_fsblock_t fsbno; /* filesystem block for bno */
+ struct xfs_bmbt_irec map; /* block map output */
+ int nmap; /* number of block maps */
+ int resblks; /* space reservation */
+ struct xfs_trans *tp;
/*
* Allocate space to the file, as necessary.
*/
while (oblocks < nblocks) {
- int cancelflags = 0;
- xfs_trans_t *tp;
-
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
/*
@@ -791,8 +789,7 @@ xfs_growfs_rt_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
resblks, 0);
if (error)
- goto error_cancel;
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+ goto out_trans_cancel;
/*
* Lock the inode.
*/
@@ -804,28 +801,26 @@ xfs_growfs_rt_alloc(
* Allocate blocks to the bitmap file.
*/
nmap = 1;
- cancelflags |= XFS_TRANS_ABORT;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
XFS_BMAPI_METADATA, &firstblock,
resblks, &map, &nmap, &flist);
if (!error && nmap < 1)
error = -ENOSPC;
if (error)
- goto error_cancel;
+ goto out_bmap_cancel;
/*
* Free any blocks freed up in the transaction, then commit.
*/
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
- goto error_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ goto out_bmap_cancel;
+ error = xfs_trans_commit(tp);
if (error)
- goto error;
+ return error;
/*
* Now we need to clear the allocated blocks.
* Do this one block per transaction, to keep it simple.
*/
- cancelflags = 0;
for (bno = map.br_startoff, fsbno = map.br_startblock;
bno < map.br_startoff + map.br_blockcount;
bno++, fsbno++) {
@@ -836,7 +831,7 @@ xfs_growfs_rt_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
0, 0);
if (error)
- goto error_cancel;
+ goto out_trans_cancel;
/*
* Lock the bitmap inode.
*/
@@ -850,27 +845,29 @@ xfs_growfs_rt_alloc(
mp->m_bsize, 0);
if (bp == NULL) {
error = -EIO;
-error_cancel:
- xfs_trans_cancel(tp, cancelflags);
- goto error;
+ goto out_trans_cancel;
}
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
* Commit the transaction.
*/
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
- goto error;
+ return error;
}
/*
* Go on to the next extent, if any.
*/
oblocks = map.br_startoff + map.br_blockcount;
}
+
return 0;
-error:
+out_bmap_cancel:
+ xfs_bmap_cancel(&flist);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
@@ -973,7 +970,6 @@ xfs_growfs_rt(
bmbno < nrbmblocks;
bmbno++) {
xfs_trans_t *tp;
- int cancelflags = 0;
*nmp = *mp;
nsbp = &nmp->m_sb;
@@ -1015,7 +1011,6 @@ xfs_growfs_rt(
mp->m_rbmip->i_d.di_size =
nsbp->sb_rbmblocks * nsbp->sb_blocksize;
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
- cancelflags |= XFS_TRANS_ABORT;
/*
* Get the summary inode into the transaction.
*/
@@ -1062,7 +1057,7 @@ xfs_growfs_rt(
nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
if (error) {
error_cancel:
- xfs_trans_cancel(tp, cancelflags);
+ xfs_trans_cancel(tp);
break;
}
/*
@@ -1076,7 +1071,7 @@ error_cancel:
mp->m_rsumlevels = nrsumlevels;
mp->m_rsumsize = nrsumsize;
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
break;
}
diff --git a/kernel/fs/xfs/xfs_stats.c b/kernel/fs/xfs/xfs_stats.c
index f2240383d..8686df6c7 100644
--- a/kernel/fs/xfs/xfs_stats.c
+++ b/kernel/fs/xfs/xfs_stats.c
@@ -18,20 +18,21 @@
#include "xfs.h"
#include <linux/proc_fs.h>
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
+struct xstats xfsstats;
-static int counter_val(int idx)
+static int counter_val(struct xfsstats __percpu *stats, int idx)
{
int val = 0, cpu;
for_each_possible_cpu(cpu)
- val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+ val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx));
return val;
}
-static int xfs_stat_proc_show(struct seq_file *m, void *v)
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{
int i, j;
+ int len = 0;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -65,54 +66,59 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
};
/* Loop over all stats groups */
+
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- seq_printf(m, "%s", xstats[i].desc);
+ len += snprintf(buf + len, PATH_MAX - len, "%s",
+ xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- seq_printf(m, " %u", counter_val(j));
- seq_putc(m, '\n');
+ len += snprintf(buf + len, PATH_MAX - len, " %u",
+ counter_val(stats, j));
+ len += snprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
- xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
- xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
- xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
}
- seq_printf(m, "xpc %Lu %Lu %Lu\n",
+ len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- seq_printf(m, "debug %u\n",
+ len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
0);
#endif
- return 0;
+
+ return len;
}
-static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+void xfs_stats_clearall(struct xfsstats __percpu *stats)
{
- return single_open(file, xfs_stat_proc_show, NULL);
+ int c;
+ __uint32_t vn_active;
+
+ xfs_notice(NULL, "Clearing xfsstats");
+ for_each_possible_cpu(c) {
+ preempt_disable();
+ /* save vn_active, it's a universal truth! */
+ vn_active = per_cpu_ptr(stats, c)->vn_active;
+ memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
+ per_cpu_ptr(stats, c)->vn_active = vn_active;
+ preempt_enable();
+ }
}
-static const struct file_operations xfs_stat_proc_fops = {
- .owner = THIS_MODULE,
- .open = xfs_stat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
seq_printf(m, "%d\t%d\t%d\t%u\n",
- 0,
- counter_val(XFSSTAT_END_XQMSTAT),
- 0,
- counter_val(XFSSTAT_END_XQMSTAT + 1));
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
return 0;
}
@@ -136,7 +142,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
seq_printf(m, "qm");
for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
- seq_printf(m, " %u", counter_val(j));
+ seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
return 0;
}
@@ -155,44 +161,35 @@ static const struct file_operations xqmstat_proc_fops = {
};
#endif /* CONFIG_XFS_QUOTA */
+#ifdef CONFIG_PROC_FS
int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
+ return -ENOMEM;
+
+ if (!proc_symlink("fs/xfs/stat", NULL,
+ "/sys/fs/xfs/stats/stats"))
goto out;
- if (!proc_create("fs/xfs/stat", 0, NULL,
- &xfs_stat_proc_fops))
- goto out_remove_xfs_dir;
#ifdef CONFIG_XFS_QUOTA
if (!proc_create("fs/xfs/xqmstat", 0, NULL,
&xqmstat_proc_fops))
- goto out_remove_stat_file;
+ goto out;
if (!proc_create("fs/xfs/xqm", 0, NULL,
&xqm_proc_fops))
- goto out_remove_xqmstat_file;
+ goto out;
#endif
return 0;
-#ifdef CONFIG_XFS_QUOTA
- out_remove_xqmstat_file:
- remove_proc_entry("fs/xfs/xqmstat", NULL);
- out_remove_stat_file:
- remove_proc_entry("fs/xfs/stat", NULL);
-#endif
- out_remove_xfs_dir:
- remove_proc_entry("fs/xfs", NULL);
- out:
+out:
+ remove_proc_subtree("fs/xfs", NULL);
return -ENOMEM;
}
void
xfs_cleanup_procfs(void)
{
-#ifdef CONFIG_XFS_QUOTA
- remove_proc_entry("fs/xfs/xqm", NULL);
- remove_proc_entry("fs/xfs/xqmstat", NULL);
-#endif
- remove_proc_entry("fs/xfs/stat", NULL);
- remove_proc_entry("fs/xfs", NULL);
+ remove_proc_subtree("fs/xfs", NULL);
}
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/fs/xfs/xfs_stats.h b/kernel/fs/xfs/xfs_stats.h
index c8f238b82..483b0eff1 100644
--- a/kernel/fs/xfs/xfs_stats.h
+++ b/kernel/fs/xfs/xfs_stats.h
@@ -19,8 +19,6 @@
#define __XFS_STATS_H__
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
#include <linux/percpu.h>
/*
@@ -215,15 +213,29 @@ struct xfsstats {
__uint64_t xs_read_bytes;
};
-DECLARE_PER_CPU(struct xfsstats, xfsstats);
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
+void xfs_stats_clearall(struct xfsstats __percpu *stats);
+extern struct xstats xfsstats;
-/*
- * We don't disable preempt, not too worried about poking the
- * wrong CPU's stat for now (also aggregated before reporting).
- */
-#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++)
-#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
-#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
+#define XFS_STATS_INC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \
+} while (0)
+
+#define XFS_STATS_DEC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \
+} while (0)
+
+#define XFS_STATS_ADD(mp, v, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \
+} while (0)
+
+#if defined(CONFIG_PROC_FS)
extern int xfs_init_procfs(void);
extern void xfs_cleanup_procfs(void);
@@ -231,10 +243,6 @@ extern void xfs_cleanup_procfs(void);
#else /* !CONFIG_PROC_FS */
-# define XFS_STATS_INC(count)
-# define XFS_STATS_DEC(count)
-# define XFS_STATS_ADD(count, inc)
-
static inline int xfs_init_procfs(void)
{
return 0;
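The reworked macros above update both the global and the per-mount per-CPU counters; a
minimal sketch of a caller (the function is hypothetical, but the pattern matches the
call sites converted throughout this patch):

	/* Sketch: account a log write against global and per-mount stats. */
	static void
	xfs_example_account_log_write(
		struct xfs_mount	*mp,
		int			blocks)
	{
		XFS_STATS_INC(mp, xs_log_writes);		/* one log write */
		XFS_STATS_ADD(mp, xs_log_blocks, blocks);	/* blocks it consumed */
	}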
diff --git a/kernel/fs/xfs/xfs_super.c b/kernel/fs/xfs/xfs_super.c
index 65a45372f..36bd8825b 100644
--- a/kernel/fs/xfs/xfs_super.c
+++ b/kernel/fs/xfs/xfs_super.c
@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
+#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
+
/*
* Table driven mount option parser.
*
@@ -259,16 +261,8 @@ xfs_parseargs(
mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
if (!mp->m_rtname)
return -ENOMEM;
- } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (kstrtoint(value, 10, &iosize))
- return -EINVAL;
- iosizelog = ffs(iosize) - 1;
- } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
+ } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) ||
+ !strcmp(this_char, MNTOPT_BIOSIZE)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
@@ -363,6 +357,10 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
mp->m_flags &= ~XFS_MOUNT_DISCARD;
+#ifdef CONFIG_FS_DAX
+ } else if (!strcmp(this_char, MNTOPT_DAX)) {
+ mp->m_flags |= XFS_MOUNT_DAX;
+#endif
} else {
xfs_warn(mp, "unknown mount option [%s].", this_char);
return -EINVAL;
@@ -452,8 +450,8 @@ done:
}
struct proc_xfs_info {
- int flag;
- char *str;
+ uint64_t flag;
+ char *str;
};
STATIC int
@@ -474,6 +472,7 @@ xfs_showargs(
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
{ XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
+ { XFS_MOUNT_DAX, "," MNTOPT_DAX },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@@ -839,17 +838,18 @@ xfs_init_mount_workqueues(
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
+ mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
@@ -923,7 +923,7 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- XFS_STATS_INC(vn_reclaim);
+ XFS_STATS_INC(ip->i_mount, vn_reclaim);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -984,8 +984,8 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- XFS_STATS_INC(vn_rele);
- XFS_STATS_INC(vn_remove);
+ XFS_STATS_INC(ip->i_mount, vn_rele);
+ XFS_STATS_INC(ip->i_mount, vn_remove);
xfs_inactive(ip);
}
@@ -1475,9 +1475,16 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_workqueues;
+ /* Allocate stats memory before we do operations that might use it */
+ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!mp->m_stats.xs_stats) {
+ error = -ENOMEM;
+ goto out_destroy_counters;
+ }
+
error = xfs_readsb(mp, flags);
if (error)
- goto out_destroy_counters;
+ goto out_free_stats;
error = xfs_finish_flags(mp);
if (error)
@@ -1507,6 +1514,24 @@ xfs_fs_fill_super(
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
sb->s_flags |= MS_I_VERSION;
+ if (mp->m_flags & XFS_MOUNT_DAX) {
+ xfs_warn(mp,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ if (sb->s_blocksize != PAGE_SIZE) {
+ xfs_alert(mp,
+ "Filesystem block size invalid for DAX Turning DAX off.");
+ mp->m_flags &= ~XFS_MOUNT_DAX;
+ } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ xfs_alert(mp,
+ "Block device does not support DAX Turning DAX off.");
+ mp->m_flags &= ~XFS_MOUNT_DAX;
+ }
+ }
+
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ xfs_alert(mp,
+ "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1528,9 +1553,11 @@ xfs_fs_fill_super(
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
+ out_free_stats:
+ free_percpu(mp->m_stats.xs_stats);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
-out_destroy_workqueues:
+ out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
@@ -1557,6 +1584,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
+ free_percpu(mp->m_stats.xs_stats);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
@@ -1821,19 +1849,32 @@ init_xfs_fs(void)
xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
if (!xfs_kset) {
error = -ENOMEM;
- goto out_sysctl_unregister;;
+ goto out_sysctl_unregister;
}
+ xfsstats.xs_kobj.kobject.kset = xfs_kset;
+
+ xfsstats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!xfsstats.xs_stats) {
+ error = -ENOMEM;
+ goto out_kset_unregister;
+ }
+
+ error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL,
+ "stats");
+ if (error)
+ goto out_free_stats;
+
#ifdef DEBUG
xfs_dbg_kobj.kobject.kset = xfs_kset;
error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
if (error)
- goto out_kset_unregister;
+ goto out_remove_stats_kobj;
#endif
error = xfs_qm_init();
if (error)
- goto out_remove_kobj;
+ goto out_remove_dbg_kobj;
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1842,11 +1883,15 @@ init_xfs_fs(void)
out_qm_exit:
xfs_qm_exit();
- out_remove_kobj:
+ out_remove_dbg_kobj:
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
- out_kset_unregister:
+ out_remove_stats_kobj:
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ out_free_stats:
+ free_percpu(xfsstats.xs_stats);
+ out_kset_unregister:
kset_unregister(xfs_kset);
out_sysctl_unregister:
xfs_sysctl_unregister();
@@ -1872,6 +1917,8 @@ exit_xfs_fs(void)
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ free_percpu(xfsstats.xs_stats);
kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
@@ -1879,6 +1926,7 @@ exit_xfs_fs(void)
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_zones();
+ xfs_uuid_table_free();
}
module_init(init_xfs_fs);
diff --git a/kernel/fs/xfs/xfs_symlink.c b/kernel/fs/xfs/xfs_symlink.c
index 40c076523..996481eeb 100644
--- a/kernel/fs/xfs/xfs_symlink.c
+++ b/kernel/fs/xfs/xfs_symlink.c
@@ -178,7 +178,6 @@ xfs_symlink(
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- uint cancel_flags;
int committed;
xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
@@ -224,7 +223,6 @@ xfs_symlink(
return error;
tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
@@ -239,12 +237,11 @@ xfs_symlink(
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto out_trans_cancel;
- }
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+ XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
/*
@@ -292,7 +289,7 @@ xfs_symlink(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
/*
@@ -394,7 +391,7 @@ xfs_symlink(
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_release_inode;
@@ -407,9 +404,8 @@ xfs_symlink(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
out_release_inode:
/*
* Wait until after the current transaction is aborted to finish the
@@ -426,7 +422,7 @@ out_release_inode:
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return error;
}
@@ -464,7 +460,7 @@ xfs_inactive_symlink_rmt(
tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -506,7 +502,7 @@ xfs_inactive_symlink_rmt(
/*
* Unmap the dead block(s) to the free_list.
*/
- error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+ error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps,
&first_block, &free_list, &done);
if (error)
goto error_bmap_cancel;
@@ -533,7 +529,7 @@ xfs_inactive_symlink_rmt(
/*
* Commit the transaction containing extent freeing and EFDs.
*/
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
goto error_unlock;
@@ -552,7 +548,7 @@ xfs_inactive_symlink_rmt(
error_bmap_cancel:
xfs_bmap_cancel(&free_list);
error_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
diff --git a/kernel/fs/xfs/xfs_sysctl.c b/kernel/fs/xfs/xfs_sysctl.c
index a0c8067ce..aed74d3f8 100644
--- a/kernel/fs/xfs/xfs_sysctl.c
+++ b/kernel/fs/xfs/xfs_sysctl.c
@@ -19,6 +19,7 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "xfs_error.h"
+#include "xfs_stats.h"
static struct ctl_table_header *xfs_table_header;
@@ -31,22 +32,12 @@ xfs_stats_clear_proc_handler(
size_t *lenp,
loff_t *ppos)
{
- int c, ret, *valp = ctl->data;
- __uint32_t vn_active;
+ int ret, *valp = ctl->data;
ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
if (!ret && write && *valp) {
- xfs_notice(NULL, "Clearing xfsstats");
- for_each_possible_cpu(c) {
- preempt_disable();
- /* save vn_active, it's a universal truth! */
- vn_active = per_cpu(xfsstats, c).vn_active;
- memset(&per_cpu(xfsstats, c), 0,
- sizeof(struct xfsstats));
- per_cpu(xfsstats, c).vn_active = vn_active;
- preempt_enable();
- }
+ xfs_stats_clearall(xfsstats.xs_stats);
xfs_stats_clear = 0;
}
diff --git a/kernel/fs/xfs/xfs_sysfs.c b/kernel/fs/xfs/xfs_sysfs.c
index aa0367085..ee70f5dec 100644
--- a/kernel/fs/xfs/xfs_sysfs.c
+++ b/kernel/fs/xfs/xfs_sysfs.c
@@ -21,11 +21,13 @@
#include "xfs_log_format.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_stats.h"
struct xfs_sysfs_attr {
struct attribute attr;
- ssize_t (*show)(char *buf, void *data);
- ssize_t (*store)(const char *buf, size_t count, void *data);
+ ssize_t (*show)(struct kobject *kobject, char *buf);
+ ssize_t (*store)(struct kobject *kobject, const char *buf,
+ size_t count);
};
static inline struct xfs_sysfs_attr *
@@ -38,6 +40,8 @@ to_attr(struct attribute *attr)
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
#define XFS_SYSFS_ATTR_RO(name) \
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+#define XFS_SYSFS_ATTR_WO(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name)
#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
@@ -51,14 +55,42 @@ struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
};
+STATIC ssize_t
+xfs_sysfs_object_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0;
+}
+
+STATIC ssize_t
+xfs_sysfs_object_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0;
+}
+
+static const struct sysfs_ops xfs_sysfs_ops = {
+ .show = xfs_sysfs_object_show,
+ .store = xfs_sysfs_object_store,
+};
+
#ifdef DEBUG
/* debug */
STATIC ssize_t
log_recovery_delay_store(
+ struct kobject *kobject,
const char *buf,
- size_t count,
- void *data)
+ size_t count)
{
int ret;
int val;
@@ -77,8 +109,8 @@ log_recovery_delay_store(
STATIC ssize_t
log_recovery_delay_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
}
@@ -89,52 +121,87 @@ static struct attribute *xfs_dbg_attrs[] = {
NULL,
};
+struct kobj_type xfs_dbg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_dbg_attrs,
+};
+
+#endif /* DEBUG */
+
+/* stats */
+
+static inline struct xstats *
+to_xstats(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xstats, xs_kobj);
+}
+
STATIC ssize_t
-xfs_dbg_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
+stats_show(
+ struct kobject *kobject,
+ char *buf)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xstats *stats = to_xstats(kobject);
- return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+ return xfs_stats_format(stats->xs_stats, buf);
}
+XFS_SYSFS_ATTR_RO(stats);
STATIC ssize_t
-xfs_dbg_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
+stats_clear_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ int ret;
+ int val;
+ struct xstats *stats = to_xstats(kobject);
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
- return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+ if (val != 1)
+ return -EINVAL;
+
+ xfs_stats_clearall(stats->xs_stats);
+ return count;
}
+XFS_SYSFS_ATTR_WO(stats_clear);
-static struct sysfs_ops xfs_dbg_ops = {
- .show = xfs_dbg_show,
- .store = xfs_dbg_store,
+static struct attribute *xfs_stats_attrs[] = {
+ ATTR_LIST(stats),
+ ATTR_LIST(stats_clear),
+ NULL,
};
-struct kobj_type xfs_dbg_ktype = {
+struct kobj_type xfs_stats_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_dbg_ops,
- .default_attrs = xfs_dbg_attrs,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_stats_attrs,
};
-#endif /* DEBUG */
-
/* xlog */
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xlog, l_kobj);
+}
+
STATIC ssize_t
log_head_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
spin_lock(&log->l_icloglock);
cycle = log->l_curr_cycle;
@@ -147,12 +214,12 @@ XFS_SYSFS_ATTR_RO(log_head_lsn);
STATIC ssize_t
log_tail_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
@@ -161,12 +228,13 @@ XFS_SYSFS_ATTR_RO(log_tail_lsn);
STATIC ssize_t
reserve_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
+
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -175,12 +243,12 @@ XFS_SYSFS_ATTR_RO(reserve_grant_head);
STATIC ssize_t
write_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -195,45 +263,8 @@ static struct attribute *xfs_log_attrs[] = {
NULL,
};
-static inline struct xlog *
-to_xlog(struct kobject *kobject)
-{
- struct xfs_kobj *kobj = to_kobj(kobject);
- return container_of(kobj, struct xlog, l_kobj);
-}
-
-STATIC ssize_t
-xfs_log_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
-{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
- return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
-}
-
-STATIC ssize_t
-xfs_log_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
-{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
- return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
-}
-
-static struct sysfs_ops xfs_log_ops = {
- .show = xfs_log_show,
- .store = xfs_log_store,
-};
-
struct kobj_type xfs_log_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_log_ops,
+ .sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_log_attrs,
};
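
The xfs_sysfs.c hunks above replace the separate xfs_dbg_ops/xfs_log_ops trampolines with one shared xfs_sysfs_ops table; each attribute handler now receives the kobject and recovers its owning structure itself (to_xstats(), to_xlog()). The standalone userspace C sketch below is not XFS code; it only illustrates the same container_of-style dispatch under hypothetical names (struct widget, struct widget_attr) to show why a single generic ops table suffices once handlers take the kobject.

#include <stddef.h>
#include <stdio.h>

/* Hypothetical userspace stand-ins for kobject and attribute. */
struct kobj { const char *name; };

struct widget {
	struct kobj kobj;	/* embedded, like xfs_kobj in xstats/xlog */
	int value;
};

struct widget_attr {
	const char *name;
	/* Handlers take the kobj, as the patched XFS attrs take kobject. */
	int (*show)(struct kobj *kobj, char *buf, size_t len);
};

/* container_of equivalent: recover the widget from its embedded kobj. */
static struct widget *to_widget(struct kobj *kobj)
{
	return (struct widget *)((char *)kobj - offsetof(struct widget, kobj));
}

static int value_show(struct kobj *kobj, char *buf, size_t len)
{
	struct widget *w = to_widget(kobj);

	return snprintf(buf, len, "%d\n", w->value);
}

/* One generic dispatcher serves every attribute, as xfs_sysfs_ops does. */
static int generic_show(struct kobj *kobj, struct widget_attr *attr,
			char *buf, size_t len)
{
	return attr->show ? attr->show(kobj, buf, len) : 0;
}

int main(void)
{
	struct widget w = { .kobj = { .name = "demo" }, .value = 42 };
	struct widget_attr value_attr = { .name = "value", .show = value_show };
	char buf[32];

	generic_show(&w.kobj, &value_attr, buf, sizeof(buf));
	fputs(buf, stdout);	/* prints "42" */
	return 0;
}
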
diff --git a/kernel/fs/xfs/xfs_sysfs.h b/kernel/fs/xfs/xfs_sysfs.h
index 240eee35f..be692e599 100644
--- a/kernel/fs/xfs/xfs_sysfs.h
+++ b/kernel/fs/xfs/xfs_sysfs.h
@@ -22,6 +22,7 @@
extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern struct kobj_type xfs_dbg_ktype; /* debug */
extern struct kobj_type xfs_log_ktype; /* xlog */
+extern struct kobj_type xfs_stats_ktype; /* stats */
static inline struct xfs_kobj *
to_kobj(struct kobject *kobject)
diff --git a/kernel/fs/xfs/xfs_trace.h b/kernel/fs/xfs/xfs_trace.h
index 615781bf4..877079eb0 100644
--- a/kernel/fs/xfs/xfs_trace.h
+++ b/kernel/fs/xfs/xfs_trace.h
@@ -687,7 +687,9 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
@@ -738,6 +740,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
__entry->blocks, __entry->shift, __entry->writeio_blocks)
)
+TRACE_EVENT(xfs_irec_merge_pre,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+ TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ __field(xfs_agino_t, nagino)
+ __field(uint16_t, nholemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ __entry->nagino = nagino;
+ __entry->nholemask = nholemask;
+ ),
+ TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->agino, __entry->holemask, __entry->nagino,
+ __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask),
+ TP_ARGS(mp, agno, agino, holemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ ),
+ TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+ MINOR(__entry->dev), __entry->agno, __entry->agino,
+ __entry->holemask)
+)
+
#define DEFINE_IREF_EVENT(name) \
DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
@@ -1264,6 +1313,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@@ -2042,6 +2092,40 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class,
+ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f),
+ TP_ARGS(log, in_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, count)
+ __field(unsigned int, isize)
+ __field(xfs_agblock_t, length)
+ __field(unsigned int, gen)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->agno = be32_to_cpu(in_f->icl_ag);
+ __entry->agbno = be32_to_cpu(in_f->icl_agbno);
+ __entry->count = be32_to_cpu(in_f->icl_count);
+ __entry->isize = be32_to_cpu(in_f->icl_isize);
+ __entry->length = be32_to_cpu(in_f->icl_length);
+ __entry->gen = be32_to_cpu(in_f->icl_gen);
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u "
+ "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agbno, __entry->count, __entry->isize,
+ __entry->length, __entry->gen)
+)
+#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \
+ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \
+ TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel);
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover);
+
DECLARE_EVENT_CLASS(xfs_discard_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len),
diff --git a/kernel/fs/xfs/xfs_trans.c b/kernel/fs/xfs/xfs_trans.c
index 220ef2c90..748b16aff 100644
--- a/kernel/fs/xfs/xfs_trans.c
+++ b/kernel/fs/xfs/xfs_trans.c
@@ -113,7 +113,7 @@ xfs_trans_free(
* blocks. Locks and log items, however, are not inherited. They must
* be added to the new transaction explicitly.
*/
-xfs_trans_t *
+STATIC xfs_trans_t *
xfs_trans_dup(
xfs_trans_t *tp)
{
@@ -251,14 +251,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
- int log_flags;
-
- if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
- log_flags = XFS_LOG_REL_PERM_RESERV;
- } else {
- log_flags = 0;
- }
- xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+ xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -744,7 +737,7 @@ void
xfs_trans_free_items(
struct xfs_trans *tp,
xfs_lsn_t commit_lsn,
- int flags)
+ bool abort)
{
struct xfs_log_item_desc *lidp, *next;
@@ -755,7 +748,7 @@ xfs_trans_free_items(
if (commit_lsn != NULLCOMMITLSN)
lip->li_ops->iop_committing(lip, commit_lsn);
- if (flags & XFS_TRANS_ABORT)
+ if (abort)
lip->li_flags |= XFS_LI_ABORTED;
lip->li_ops->iop_unlock(lip);
@@ -892,27 +885,17 @@ xfs_trans_committed_bulk(
* have already been unlocked as if the commit had succeeded.
* Do not reference the transaction structure after this call.
*/
-int
-xfs_trans_commit(
+static int
+__xfs_trans_commit(
struct xfs_trans *tp,
- uint flags)
+ bool regrant)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_lsn_t commit_lsn = -1;
int error = 0;
- int log_flags = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
/*
- * Determine whether this commit is releasing a permanent
- * log reservation or not.
- */
- if (flags & XFS_TRANS_RELEASE_LOG_RES) {
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- log_flags = XFS_LOG_REL_PERM_RESERV;
- }
-
- /*
* If there is nothing to be logged by the transaction,
* then unlock all of the items associated with the
* transaction and free the transaction structure.
@@ -936,7 +919,7 @@ xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
+ xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
xfs_trans_free(tp);
@@ -947,9 +930,9 @@ xfs_trans_commit(
*/
if (sync) {
error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
- XFS_STATS_INC(xs_trans_sync);
+ XFS_STATS_INC(mp, xs_trans_sync);
} else {
- XFS_STATS_INC(xs_trans_async);
+ XFS_STATS_INC(mp, xs_trans_async);
}
return error;
@@ -964,18 +947,25 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (commit_lsn == -1 && !error)
error = -EIO;
}
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
- XFS_STATS_INC(xs_trans_empty);
+ XFS_STATS_INC(mp, xs_trans_empty);
return error;
}
+int
+xfs_trans_commit(
+ struct xfs_trans *tp)
+{
+ return __xfs_trans_commit(tp, false);
+}
+
/*
* Unlock all of the transaction's items and free the transaction.
* The transaction must not have modified any of its items, because
@@ -986,29 +976,22 @@ out_unreserve:
*/
void
xfs_trans_cancel(
- xfs_trans_t *tp,
- int flags)
+ struct xfs_trans *tp)
{
- int log_flags;
- xfs_mount_t *mp = tp->t_mountp;
+ struct xfs_mount *mp = tp->t_mountp;
+ bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
/*
- * See if the caller is being too lazy to figure out if
- * the transaction really needs an abort.
- */
- if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
- flags &= ~XFS_TRANS_ABORT;
- /*
* See if the caller is relying on us to shut down the
* filesystem. This happens in paths where we detect
* corruption and decide to give up.
*/
- if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
#ifdef DEBUG
- if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
struct xfs_log_item_desc *lidp;
list_for_each_entry(lidp, &tp->t_items, lid_trans)
@@ -1018,34 +1001,28 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_sb(tp);
xfs_trans_unreserve_and_mod_dquots(tp);
- if (tp->t_ticket) {
- if (flags & XFS_TRANS_RELEASE_LOG_RES) {
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- log_flags = XFS_LOG_REL_PERM_RESERV;
- } else {
- log_flags = 0;
- }
- xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
- }
+ if (tp->t_ticket)
+ xfs_log_done(mp, tp->t_ticket, NULL, false);
/* mark this thread as no longer being in a transaction */
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
xfs_trans_free(tp);
}
/*
* Roll from one trans in the sequence of PERMANENT transactions to
* the next: permanent transactions are only flushed out when
- * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * committed with xfs_trans_commit(), but we still want as soon
* as possible to let chunks of it go to the log. So we commit the
* chunk we've been working on and get a new transaction to continue.
*/
int
-xfs_trans_roll(
+__xfs_trans_roll(
struct xfs_trans **tpp,
- struct xfs_inode *dp)
+ struct xfs_inode *dp,
+ int *committed)
{
struct xfs_trans *trans;
struct xfs_trans_res tres;
@@ -1055,7 +1032,8 @@ xfs_trans_roll(
* Ensure that the inode is always logged.
*/
trans = *tpp;
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+ if (dp)
+ xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
/*
* Copy the critical parameters from one trans to the next.
@@ -1071,20 +1049,14 @@ xfs_trans_roll(
* is in progress. The caller takes the responsibility to cancel
* the duplicate transaction that gets returned.
*/
- error = xfs_trans_commit(trans, 0);
+ error = __xfs_trans_commit(trans, true);
if (error)
return error;
+ *committed = 1;
trans = *tpp;
/*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(trans->t_ticket);
-
-
- /*
* Reserve space in the log for the next transaction.
* This also pushes items in the "AIL", the list of logged items,
* out to disk if they are taking up space at the tail of the log
@@ -1100,6 +1072,16 @@ xfs_trans_roll(
if (error)
return error;
- xfs_trans_ijoin(trans, dp, 0);
+ if (dp)
+ xfs_trans_ijoin(trans, dp, 0);
return 0;
}
+
+int
+xfs_trans_roll(
+ struct xfs_trans **tpp,
+ struct xfs_inode *dp)
+{
+ int committed = 0;
+ return __xfs_trans_roll(tpp, dp, &committed);
+}
diff --git a/kernel/fs/xfs/xfs_trans.h b/kernel/fs/xfs/xfs_trans.h
index b5bc1ab3c..4643070d7 100644
--- a/kernel/fs/xfs/xfs_trans.h
+++ b/kernel/fs/xfs/xfs_trans.h
@@ -133,8 +133,6 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces that are
* actually macros.
*/
-#define xfs_trans_get_log_res(tp) ((tp)->t_log_res)
-#define xfs_trans_get_log_count(tp) ((tp)->t_log_count)
#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
@@ -153,7 +151,6 @@ typedef struct xfs_trans {
*/
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
uint, uint);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -216,7 +213,6 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
-void xfs_efi_release(struct xfs_efi_log_item *, uint);
void xfs_trans_log_efi_extent(xfs_trans_t *,
struct xfs_efi_log_item *,
xfs_fsblock_t,
@@ -224,13 +220,13 @@ void xfs_trans_log_efi_extent(xfs_trans_t *,
struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
struct xfs_efi_log_item *,
uint);
-void xfs_trans_log_efd_extent(xfs_trans_t *,
- struct xfs_efd_log_item *,
- xfs_fsblock_t,
- xfs_extlen_t);
-int xfs_trans_commit(xfs_trans_t *, uint flags);
+int xfs_trans_free_extent(struct xfs_trans *,
+ struct xfs_efd_log_item *, xfs_fsblock_t,
+ xfs_extlen_t);
+int xfs_trans_commit(struct xfs_trans *);
+int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-void xfs_trans_cancel(xfs_trans_t *, int);
+void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/kernel/fs/xfs/xfs_trans_ail.c b/kernel/fs/xfs/xfs_trans_ail.c
index 573aefb5a..4f18fd92c 100644
--- a/kernel/fs/xfs/xfs_trans_ail.c
+++ b/kernel/fs/xfs/xfs_trans_ail.c
@@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next(
{
struct xfs_log_item *lip = cur->item;
- if ((__psint_t)lip & 1)
+ if ((uintptr_t)lip & 1)
lip = xfs_ail_min(ailp);
if (lip)
cur->item = xfs_ail_next(ailp, lip);
@@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear(
list_for_each_entry(cur, &ailp->xa_cursors, list) {
if (cur->item == lip)
cur->item = (struct xfs_log_item *)
- ((__psint_t)cur->item | 1);
+ ((uintptr_t)cur->item | 1);
}
}
@@ -287,7 +287,7 @@ xfs_ail_splice(
* find the place in the AIL where the items belong.
*/
lip = cur ? cur->item : NULL;
- if (!lip || (__psint_t) lip & 1)
+ if (!lip || (uintptr_t)lip & 1)
lip = __xfs_trans_ail_cursor_last(ailp, lsn);
/*
@@ -349,7 +349,7 @@ xfsaild_push(
xfs_ail_min_lsn(ailp))) {
ailp->xa_log_flush = 0;
- XFS_STATS_INC(xs_push_ail_flush);
+ XFS_STATS_INC(mp, xs_push_ail_flush);
xfs_log_force(mp, XFS_LOG_SYNC);
}
@@ -371,7 +371,7 @@ xfsaild_push(
goto out_done;
}
- XFS_STATS_INC(xs_push_ail);
+ XFS_STATS_INC(mp, xs_push_ail);
lsn = lip->li_lsn;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
@@ -385,7 +385,7 @@ xfsaild_push(
lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
- XFS_STATS_INC(xs_push_ail_success);
+ XFS_STATS_INC(mp, xs_push_ail_success);
trace_xfs_ail_push(lip);
ailp->xa_last_pushed_lsn = lsn;
@@ -403,7 +403,7 @@ xfsaild_push(
* re-try the flushing relatively soon if most of the
* AIL is being flushed.
*/
- XFS_STATS_INC(xs_push_ail_flushing);
+ XFS_STATS_INC(mp, xs_push_ail_flushing);
trace_xfs_ail_flushing(lip);
flushing++;
@@ -411,14 +411,14 @@ xfsaild_push(
break;
case XFS_ITEM_PINNED:
- XFS_STATS_INC(xs_push_ail_pinned);
+ XFS_STATS_INC(mp, xs_push_ail_pinned);
trace_xfs_ail_pinned(lip);
stuck++;
ailp->xa_log_flush++;
break;
case XFS_ITEM_LOCKED:
- XFS_STATS_INC(xs_push_ail_locked);
+ XFS_STATS_INC(mp, xs_push_ail_locked);
trace_xfs_ail_locked(lip);
stuck++;
diff --git a/kernel/fs/xfs/xfs_trans_dquot.c b/kernel/fs/xfs/xfs_trans_dquot.c
index 76a16df55..ce78534a0 100644
--- a/kernel/fs/xfs/xfs_trans_dquot.c
+++ b/kernel/fs/xfs/xfs_trans_dquot.c
@@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo(
xfs_trans_t *ntp)
{
xfs_dqtrx_t *oq, *nq;
- int i,j;
+ int i, j;
xfs_dqtrx_t *oqa, *nqa;
+ ulong blk_res_used;
if (!otp->t_dqinfo)
return;
@@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo(
* Because the quota blk reservation is carried forward,
* it is also necessary to carry forward the DQ_DIRTY flag.
*/
- if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+ if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
oqa = otp->t_dqinfo->dqs[j];
nqa = ntp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ blk_res_used = 0;
+
if (oqa[i].qt_dquot == NULL)
break;
oq = &oqa[i];
nq = &nqa[i];
+ if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
+ blk_res_used = oq->qt_bcount_delta;
+
nq->qt_dquot = oq->qt_dquot;
nq->qt_bcount_delta = nq->qt_icount_delta = 0;
nq->qt_rtbcount_delta = 0;
@@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo(
/*
* Transfer whatever is left of the reservations.
*/
- nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
- oq->qt_blk_res = oq->qt_blk_res_used;
+ nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
+ oq->qt_blk_res = blk_res_used;
nq->qt_rtblk_res = oq->qt_rtblk_res -
oq->qt_rtblk_res_used;
@@ -239,10 +245,6 @@ xfs_trans_mod_dquot(
* disk blocks used.
*/
case XFS_TRANS_DQ_BCOUNT:
- if (qtrx->qt_blk_res && delta > 0) {
- qtrx->qt_blk_res_used += (ulong)delta;
- ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
- }
qtrx->qt_bcount_delta += delta;
break;
@@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas(
* reservation that a transaction structure knows of.
*/
if (qtrx->qt_blk_res != 0) {
- if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
- if (qtrx->qt_blk_res >
- qtrx->qt_blk_res_used)
+ ulong blk_res_used = 0;
+
+ if (qtrx->qt_bcount_delta > 0)
+ blk_res_used = qtrx->qt_bcount_delta;
+
+ if (qtrx->qt_blk_res != blk_res_used) {
+ if (qtrx->qt_blk_res > blk_res_used)
dqp->q_res_bcount -= (xfs_qcnt_t)
(qtrx->qt_blk_res -
- qtrx->qt_blk_res_used);
+ blk_res_used);
else
dqp->q_res_bcount -= (xfs_qcnt_t)
- (qtrx->qt_blk_res_used -
+ (blk_res_used -
qtrx->qt_blk_res);
}
} else {
diff --git a/kernel/fs/xfs/xfs_trans_extfree.c b/kernel/fs/xfs/xfs_trans_extfree.c
index 284397dd7..a96ae540e 100644
--- a/kernel/fs/xfs/xfs_trans_extfree.c
+++ b/kernel/fs/xfs/xfs_trans_extfree.c
@@ -25,6 +25,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
/*
* This routine is called to allocate an "extent free intention"
@@ -108,19 +109,30 @@ xfs_trans_get_efd(xfs_trans_t *tp,
}
/*
- * This routine is called to indicate that the described
- * extent is to be logged as having been freed. It should
- * be called once for each extent freed.
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails to support the
+ * EFI/EFD lifecycle rules.
*/
-void
-xfs_trans_log_efd_extent(xfs_trans_t *tp,
- xfs_efd_log_item_t *efdp,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len)
+int
+xfs_trans_free_extent(
+ struct xfs_trans *tp,
+ struct xfs_efd_log_item *efdp,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len)
{
uint next_extent;
- xfs_extent_t *extp;
+ struct xfs_extent *extp;
+ int error;
+ error = xfs_free_extent(tp, start_block, ext_len);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
tp->t_flags |= XFS_TRANS_DIRTY;
efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
@@ -130,4 +142,6 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp,
extp->ext_start = start_block;
extp->ext_len = ext_len;
efdp->efd_next_extent++;
+
+ return error;
}
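
xfs_trans_free_extent() above marks the transaction dirty regardless of whether xfs_free_extent() succeeded, so a failed free still forces an abort that releases the EFI/EFD pair instead of leaking it. The self-contained C sketch below is not the XFS implementation; it uses invented names (struct txn, txn_cancel) purely to illustrate that "mark dirty regardless, let the cancel path clean up" shape.

#include <stdbool.h>
#include <stdio.h>

/* Toy transaction: 'dirty' means cancel must take the abort path. */
struct txn {
	bool dirty;
};

/* Stand-in for xfs_free_extent(); may fail. */
static int free_extent(long start, long len)
{
	(void)start;
	return len > 0 ? 0 : -1;	/* pretend zero-length frees fail */
}

static int txn_free_extent(struct txn *tp, long start, long len)
{
	int error = free_extent(start, len);

	/*
	 * Mark the transaction dirty even on error, mirroring the XFS
	 * change: the eventual cancel then aborts and releases the
	 * intent/done items instead of leaking them.
	 */
	tp->dirty = true;
	return error;
}

static void txn_cancel(struct txn *tp)
{
	if (tp->dirty)
		puts("abort: releasing logged items");
	else
		puts("clean cancel: nothing logged");
}

int main(void)
{
	struct txn tp = { .dirty = false };

	if (txn_free_extent(&tp, 1024, 0))	/* simulated failure */
		txn_cancel(&tp);		/* still takes the abort path */
	return 0;
}
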
diff --git a/kernel/fs/xfs/xfs_trans_inode.c b/kernel/fs/xfs/xfs_trans_inode.c
index 17280cd71..b97f1df91 100644
--- a/kernel/fs/xfs/xfs_trans_inode.c
+++ b/kernel/fs/xfs/xfs_trans_inode.c
@@ -108,6 +108,15 @@ xfs_trans_log_inode(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/*
+ * Record the specific change for fdatasync optimisation. This
+ * allows fdatasync to skip log forces for inodes that are only
+ * timestamp dirty. We do this before the change count so that
+ * the core being logged in this case does not impact on fdatasync
+ * behaviour.
+ */
+ ip->i_itemp->ili_fsync_fields |= flags;
+
+ /*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. We don't use
* inode_inc_version() because there is no need for extra locking around
diff --git a/kernel/fs/xfs/xfs_trans_priv.h b/kernel/fs/xfs/xfs_trans_priv.h
index bd1281862..49931b72d 100644
--- a/kernel/fs/xfs/xfs_trans_priv.h
+++ b/kernel/fs/xfs/xfs_trans_priv.h
@@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *);
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
- int flags);
+ bool abort);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
@@ -119,6 +119,21 @@ xfs_trans_ail_delete(
xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
}
+static inline void
+xfs_trans_ail_remove(
+ struct xfs_log_item *lip,
+ int shutdown_type)
+{
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ spin_lock(&ailp->xa_lock);
+ /* xfs_trans_ail_delete() drops the AIL lock */
+ if (lip->li_flags & XFS_LI_IN_AIL)
+ xfs_trans_ail_delete(ailp, lip, shutdown_type);
+ else
+ spin_unlock(&ailp->xa_lock);
+}
+
void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_ail_push_all(struct xfs_ail *);
void xfs_ail_push_all_sync(struct xfs_ail *);
diff --git a/kernel/fs/xfs/xfs_xattr.c b/kernel/fs/xfs/xfs_xattr.c
index c03681518..839b35ca2 100644
--- a/kernel/fs/xfs/xfs_xattr.c
+++ b/kernel/fs/xfs/xfs_xattr.c
@@ -32,9 +32,10 @@
static int
-xfs_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int xflags)
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *value, size_t size)
{
+ int xflags = handler->flags;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error, asize = size;
@@ -53,11 +54,35 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
return asize;
}
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name,
+ int xflags)
+{
+ /*
+ * Invalidate any cached ACLs if the user has bypassed the ACL
+ * interface. We don't validate the content whatsoever so it is the caller's
+ * responsibility to provide data in valid format and ensure i_mode is
+ * consistent.
+ */
+ if (xflags & ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+#endif
+ }
+}
+
static int
-xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int xflags)
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *value, size_t size, int flags)
{
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int xflags = handler->flags;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int error;
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -70,8 +95,12 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
if (!value)
return xfs_attr_remove(ip, (unsigned char *)name, xflags);
- return xfs_attr_set(ip, (unsigned char *)name,
+ error = xfs_attr_set(ip, (unsigned char *)name,
(void *)value, size, xflags);
+ if (!error)
+ xfs_forget_acl(d_inode(dentry), name, xflags);
+
+ return error;
}
static const struct xattr_handler xfs_xattr_user_handler = {