summaryrefslogtreecommitdiffstats
path: root/kernel/include/linux/ceph
diff options
context:
space:
mode:
authorYunhong Jiang <yunhong.jiang@intel.com>2015-08-04 12:17:53 -0700
committerYunhong Jiang <yunhong.jiang@intel.com>2015-08-04 15:44:42 -0700
commit9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 (patch)
tree1c9cafbcd35f783a87880a10f85d1a060db1a563 /kernel/include/linux/ceph
parent98260f3884f4a202f9ca5eabed40b1354c489b29 (diff)
Add the rt linux 4.1.3-rt3 as base
Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Diffstat (limited to 'kernel/include/linux/ceph')
-rw-r--r--kernel/include/linux/ceph/auth.h142
-rw-r--r--kernel/include/linux/ceph/buffer.h37
-rw-r--r--kernel/include/linux/ceph/ceph_debug.h38
-rw-r--r--kernel/include/linux/ceph/ceph_features.h119
-rw-r--r--kernel/include/linux/ceph/ceph_frag.h109
-rw-r--r--kernel/include/linux/ceph/ceph_fs.h763
-rw-r--r--kernel/include/linux/ceph/ceph_hash.h13
-rw-r--r--kernel/include/linux/ceph/debugfs.h27
-rw-r--r--kernel/include/linux/ceph/decode.h259
-rw-r--r--kernel/include/linux/ceph/libceph.h230
-rw-r--r--kernel/include/linux/ceph/mdsmap.h63
-rw-r--r--kernel/include/linux/ceph/messenger.h303
-rw-r--r--kernel/include/linux/ceph/mon_client.h119
-rw-r--r--kernel/include/linux/ceph/msgpool.h26
-rw-r--r--kernel/include/linux/ceph/msgr.h185
-rw-r--r--kernel/include/linux/ceph/osd_client.h377
-rw-r--r--kernel/include/linux/ceph/osdmap.h224
-rw-r--r--kernel/include/linux/ceph/pagelist.h80
-rw-r--r--kernel/include/linux/ceph/rados.h469
-rw-r--r--kernel/include/linux/ceph/types.h29
20 files changed, 3612 insertions, 0 deletions
diff --git a/kernel/include/linux/ceph/auth.h b/kernel/include/linux/ceph/auth.h
new file mode 100644
index 000000000..260d78b58
--- /dev/null
+++ b/kernel/include/linux/ceph/auth.h
@@ -0,0 +1,142 @@
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * Abstract interface for communicating with the authenticate module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys. These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_authorizer;
+struct ceph_msg;
+
+struct ceph_auth_handshake {
+ struct ceph_authorizer *authorizer;
+ void *authorizer_buf;
+ size_t authorizer_buf_len;
+ void *authorizer_reply_buf;
+ size_t authorizer_reply_buf_len;
+ int (*sign_message)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+ int (*check_message_signature)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+};
+
+struct ceph_auth_client_ops {
+ const char *name;
+
+ /*
+ * true if we are authenticated and can connect to
+ * services.
+ */
+ int (*is_authenticated)(struct ceph_auth_client *ac);
+
+ /*
+ * true if we should (re)authenticate, e.g., when our tickets
+ * are getting old and crusty.
+ */
+ int (*should_authenticate)(struct ceph_auth_client *ac);
+
+ /*
+ * build requests and process replies during monitor
+ * handshake. if handle_reply returns -EAGAIN, we build
+ * another request.
+ */
+ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+ int (*handle_reply)(struct ceph_auth_client *ac, int result,
+ void *buf, void *end);
+
+ /*
+ * Create authorizer for connecting to a service, and verify
+ * the response to authenticate the service.
+ */
+ int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth);
+ /* ensure that an existing authorizer is up to date */
+ int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth);
+ int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len);
+ void (*destroy_authorizer)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a);
+ void (*invalidate_authorizer)(struct ceph_auth_client *ac,
+ int peer_type);
+
+ /* reset when we (re)connect to a monitor */
+ void (*reset)(struct ceph_auth_client *ac);
+
+ void (*destroy)(struct ceph_auth_client *ac);
+
+ int (*sign_message)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+ int (*check_message_signature)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+};
+
+struct ceph_auth_client {
+ u32 protocol; /* CEPH_AUTH_* */
+ void *private; /* for use by protocol implementation */
+ const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
+
+ bool negotiating; /* true if negotiating protocol */
+ const char *name; /* entity name */
+ u64 global_id; /* our unique id in system */
+ const struct ceph_crypto_key *key; /* our secret key */
+ unsigned want_keys; /* which services we want */
+
+ struct mutex mutex;
+};
+
+extern struct ceph_auth_client *ceph_auth_init(const char *name,
+ const struct ceph_crypto_key *key);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+ void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len);
+extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+
+extern int ceph_build_auth(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len);
+
+extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
+ int peer_type,
+ struct ceph_auth_handshake *auth);
+extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a);
+extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
+ int peer_type,
+ struct ceph_auth_handshake *a);
+extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ size_t len);
+extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type);
+
+static inline int ceph_auth_sign_message(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ if (auth->sign_message)
+ return auth->sign_message(auth, msg);
+ return 0;
+}
+
+static inline
+int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ if (auth->check_message_signature)
+ return auth->check_message_signature(auth, msg);
+ return 0;
+}
+#endif
diff --git a/kernel/include/linux/ceph/buffer.h b/kernel/include/linux/ceph/buffer.h
new file mode 100644
index 000000000..07ca15e76
--- /dev/null
+++ b/kernel/include/linux/ceph/buffer.h
@@ -0,0 +1,37 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for smaller sizes, vmalloc for larger sizes.
+ */
+struct ceph_buffer {
+ struct kref kref;
+ struct kvec vec;
+ size_t alloc_len;
+};
+
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+ kref_get(&b->kref);
+ return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+ kref_put(&b->kref, ceph_buffer_release);
+}
+
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
+
+#endif
diff --git a/kernel/include/linux/ceph/ceph_debug.h b/kernel/include/linux/ceph/ceph_debug.h
new file mode 100644
index 000000000..aa2e19182
--- /dev/null
+++ b/kernel/include/linux/ceph/ceph_debug.h
@@ -0,0 +1,38 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+# define dout(fmt, ...) \
+ pr_debug("%.*s %12.12s:%-4d : " fmt, \
+ 8 - (int)sizeof(KBUILD_MODNAME), " ", \
+ ceph_file_part(__FILE__, sizeof(__FILE__)), \
+ __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+# define dout(fmt, ...) do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ } while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/kernel/include/linux/ceph/ceph_features.h b/kernel/include/linux/ceph/ceph_features.h
new file mode 100644
index 000000000..4763ad64e
--- /dev/null
+++ b/kernel/include/linux/ceph/ceph_features.h
@@ -0,0 +1,119 @@
+#ifndef __CEPH_FEATURES
+#define __CEPH_FEATURES
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_UID (1ULL<<0)
+#define CEPH_FEATURE_NOSRCADDR (1ULL<<1)
+#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2)
+#define CEPH_FEATURE_FLOCK (1ULL<<3)
+#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4)
+#define CEPH_FEATURE_MONNAMES (1ULL<<5)
+#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6)
+#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7)
+#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8)
+#define CEPH_FEATURE_PGID64 (1ULL<<9)
+#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10)
+#define CEPH_FEATURE_PGPOOL3 (1ULL<<11)
+#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12)
+#define CEPH_FEATURE_OSDENC (1ULL<<13)
+#define CEPH_FEATURE_OMAP (1ULL<<14)
+#define CEPH_FEATURE_MONENC (1ULL<<15)
+#define CEPH_FEATURE_QUERY_T (1ULL<<16)
+#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17)
+#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18)
+#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19)
+#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20)
+#define CEPH_FEATURE_MON_GV (1ULL<<21)
+#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22)
+#define CEPH_FEATURE_MSG_AUTH (1ULL<<23)
+#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24)
+#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25)
+#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26)
+#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27)
+#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28)
+#define CEPH_FEATURE_MDSENC (1ULL<<29)
+#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30)
+#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31)
+#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
+#define CEPH_FEATURE_MON_SCRUB (1ULL<<33)
+#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
+#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
+#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */
+#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
+#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
+#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */
+/* The process supports new-style OSDMap encoding. Monitors also use
+ this bit to determine if peers support NAK messages. */
+#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39)
+#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40)
+#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41)
+#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */
+#define CEPH_FEATURE_MSGR_KEEPALIVE2 (1ULL<<42)
+#define CEPH_FEATURE_OSD_POOLRESEND (1ULL<<43)
+#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44)
+#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45)
+#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46)
+#define CEPH_FEATURE_OSD_REPOP (1ULL<<46) /* overlap with fadvise */
+#define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */
+#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */
+#define CEPH_FEATURE_MDS_QUOTA (1ULL<<47)
+#define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */
+#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
+// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
+#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
+
+/*
+ * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
+ * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63
+ * to mean 33 bit ~0, and introduce a helper below to do the
+ * translation.
+ *
+ * This was introduced by ceph.git commit
+ * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8
+ * and fixed by ceph.git commit
+ * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c
+ */
+#define CEPH_FEATURE_RESERVED (1ULL<<63)
+
+static inline u64 ceph_sanitize_features(u64 features)
+{
+ if (features & CEPH_FEATURE_RESERVED) {
+ /* everything through OSD_SNAPMAPPER */
+ return 0x1ffffffffull;
+ } else {
+ return features;
+ }
+}
+
+/*
+ * Features supported.
+ */
+#define CEPH_FEATURES_SUPPORTED_DEFAULT \
+ (CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_RECONNECT_SEQ | \
+ CEPH_FEATURE_PGID64 | \
+ CEPH_FEATURE_PGPOOL3 | \
+ CEPH_FEATURE_OSDENC | \
+ CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_MSG_AUTH | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_REPLY_CREATE_INODE | \
+ CEPH_FEATURE_OSDHASHPSPOOL | \
+ CEPH_FEATURE_OSD_CACHEPOOL | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_EXPORT_PEER | \
+ CEPH_FEATURE_OSDMAP_ENC | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
+ CEPH_FEATURE_CRUSH_V4)
+
+#define CEPH_FEATURES_REQUIRED_DEFAULT \
+ (CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_RECONNECT_SEQ | \
+ CEPH_FEATURE_PGID64 | \
+ CEPH_FEATURE_PGPOOL3 | \
+ CEPH_FEATURE_OSDENC)
+
+#endif
diff --git a/kernel/include/linux/ceph/ceph_frag.h b/kernel/include/linux/ceph/ceph_frag.h
new file mode 100644
index 000000000..5babb8e95
--- /dev/null
+++ b/kernel/include/linux/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask. Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ * 8 upper bits = "bits"
+ * 24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value. This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically. However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+ return (b << 24) |
+ (v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+ return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+ return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+ return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+ return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+ return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+ /* is sub as specific as us, and contained by us? */
+ return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+ (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f) - 1,
+ ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+ return ceph_frag_bits(f) > 0 &&
+ (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+ return ceph_frag_bits(f) > 0 &&
+ (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f)+1,
+ ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+ int newbits = ceph_frag_bits(f) + by;
+ return ceph_frag_make(newbits,
+ ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+ return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+ return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/kernel/include/linux/ceph/ceph_fs.h b/kernel/include/linux/ceph/ceph_fs.h
new file mode 100644
index 000000000..d7d072a25
--- /dev/null
+++ b/kernel/include/linux/ceph/ceph_fs.h
@@ -0,0 +1,763 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef CEPH_FS_H
+#define CEPH_FS_H
+
+#include <linux/ceph/msgr.h>
+#include <linux/ceph/rados.h>
+
+/*
+ * subprotocol versions. when specific messages types or high-level
+ * protocols change, bump the affected components. we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSDC_PROTOCOL 24 /* server/client */
+#define CEPH_MDSC_PROTOCOL 32 /* server/client */
+#define CEPH_MONC_PROTOCOL 15 /* server/client */
+
+
+#define CEPH_INO_ROOT 1
+#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON 31
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ __le32 fl_stripe_count; /* over this many objects */
+ __le32 fl_object_size; /* until objects are this big, then move to
+ new objects */
+ __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
+
+ /* object -> pg layout */
+ __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
+ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
+#define ceph_file_layout_stripe_count(l) \
+ ((__s32)le32_to_cpu((l).fl_stripe_count))
+#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
+#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
+#define ceph_file_layout_object_su(l) \
+ ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
+#define ceph_file_layout_pg_pool(l) \
+ ((__s32)le32_to_cpu((l).fl_pg_pool))
+
+static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
+{
+ return le32_to_cpu(l->fl_stripe_unit) *
+ le32_to_cpu(l->fl_stripe_count);
+}
+
+/* "period" == bytes before i start on a new set of objects */
+static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+{
+ return le32_to_cpu(l->fl_object_size) *
+ le32_to_cpu(l->fl_stripe_count);
+}
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+
+struct ceph_dir_layout {
+ __u8 dl_dir_hash; /* see ceph_hash.h for ids */
+ __u8 dl_unused1;
+ __u16 dl_unused2;
+ __u32 dl_unused3;
+} __attribute__ ((packed));
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES 0x1
+
+#define CEPH_AES_IV "cephsageyudagreg"
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN 0x0
+#define CEPH_AUTH_NONE 0x1
+#define CEPH_AUTH_CEPHX 0x2
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN 1
+#define CEPH_MSG_PING 2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP 4
+#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_STATFS 13
+#define CEPH_MSG_STATFS_REPLY 14
+#define CEPH_MSG_MON_SUBSCRIBE 15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
+#define CEPH_MSG_AUTH 17
+#define CEPH_MSG_AUTH_REPLY 18
+#define CEPH_MSG_MON_GET_VERSION 19
+#define CEPH_MSG_MON_GET_VERSION_REPLY 20
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP 21
+
+#define CEPH_MSG_CLIENT_SESSION 22
+#define CEPH_MSG_CLIENT_RECONNECT 23
+
+#define CEPH_MSG_CLIENT_REQUEST 24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_CAPS 0x310
+#define CEPH_MSG_CLIENT_LEASE 0x311
+#define CEPH_MSG_CLIENT_SNAP 0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY 48
+#define CEPH_MSG_POOLOP 49
+
+
+/* osd */
+#define CEPH_MSG_OSD_MAP 41
+#define CEPH_MSG_OSD_OP 42
+#define CEPH_MSG_OSD_OPREPLY 43
+#define CEPH_MSG_WATCH_NOTIFY 44
+
+
+/* watch-notify operations */
+enum {
+ WATCH_NOTIFY = 1, /* notifying watcher */
+ WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+};
+
+
+struct ceph_mon_request_header {
+ __le64 have_version;
+ __le16 session_mon;
+ __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+ __le64 kb, kb_used, kb_avail;
+ __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+ struct ceph_fsid fsid;
+ __le64 version;
+ struct ceph_statfs st;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+ struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */
+
+struct ceph_mon_subscribe_item {
+ __le64 have_version; __le64 have;
+ __u8 onetime;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+ __le32 duration; /* seconds */
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mdsmap flags
+ */
+#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
+
+/*
+ * mds states
+ * > 0 -> in
+ * <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
+ empty log. */
+#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+#define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
+ operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ * - these are bitmasks.. we can compose them
+ * - they also define the lock ordering by the MDS
+ * - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DVERSION 1
+#define CEPH_LOCK_DN 2
+#define CEPH_LOCK_ISNAP 16
+#define CEPH_LOCK_IVERSION 32 /* mds internal */
+#define CEPH_LOCK_IFILE 64
+#define CEPH_LOCK_IAUTH 128
+#define CEPH_LOCK_ILINK 256
+#define CEPH_LOCK_IDFT 512 /* dir frag tree */
+#define CEPH_LOCK_INEST 1024 /* mds internal */
+#define CEPH_LOCK_IXATTR 2048
+#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
+#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
+#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */
+
+/* client_session ops */
+enum {
+ CEPH_SESSION_REQUEST_OPEN,
+ CEPH_SESSION_OPEN,
+ CEPH_SESSION_REQUEST_CLOSE,
+ CEPH_SESSION_CLOSE,
+ CEPH_SESSION_REQUEST_RENEWCAPS,
+ CEPH_SESSION_RENEWCAPS,
+ CEPH_SESSION_STALE,
+ CEPH_SESSION_RECALL_STATE,
+ CEPH_SESSION_FLUSHMSG,
+ CEPH_SESSION_FLUSHMSG_ACK,
+ CEPH_SESSION_FORCE_RO,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+ __le32 op;
+ __le64 seq;
+ struct ceph_timespec stamp;
+ __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ * & 0x001000 -> write op
+ * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ & & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE 0x001000
+enum {
+ CEPH_MDS_OP_LOOKUP = 0x00100,
+ CEPH_MDS_OP_GETATTR = 0x00101,
+ CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+ CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+ CEPH_MDS_OP_LOOKUPINO = 0x00104,
+ CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+
+ CEPH_MDS_OP_SETXATTR = 0x01105,
+ CEPH_MDS_OP_RMXATTR = 0x01106,
+ CEPH_MDS_OP_SETLAYOUT = 0x01107,
+ CEPH_MDS_OP_SETATTR = 0x01108,
+ CEPH_MDS_OP_SETFILELOCK= 0x01109,
+ CEPH_MDS_OP_GETFILELOCK= 0x00110,
+ CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
+
+ CEPH_MDS_OP_MKNOD = 0x01201,
+ CEPH_MDS_OP_LINK = 0x01202,
+ CEPH_MDS_OP_UNLINK = 0x01203,
+ CEPH_MDS_OP_RENAME = 0x01204,
+ CEPH_MDS_OP_MKDIR = 0x01220,
+ CEPH_MDS_OP_RMDIR = 0x01221,
+ CEPH_MDS_OP_SYMLINK = 0x01222,
+
+ CEPH_MDS_OP_CREATE = 0x01301,
+ CEPH_MDS_OP_OPEN = 0x00302,
+ CEPH_MDS_OP_READDIR = 0x00305,
+
+ CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+ CEPH_MDS_OP_MKSNAP = 0x01400,
+ CEPH_MDS_OP_RMSNAP = 0x01401,
+ CEPH_MDS_OP_LSSNAP = 0x00402,
+ CEPH_MDS_OP_RENAMESNAP = 0x01403,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE 1
+#define CEPH_SETATTR_UID 2
+#define CEPH_SETATTR_GID 4
+#define CEPH_SETATTR_MTIME 8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE 32
+#define CEPH_SETATTR_CTIME 64
+
+/*
+ * Ceph setxattr request flags.
+ */
+#define CEPH_XATTR_CREATE (1 << 0)
+#define CEPH_XATTR_REPLACE (1 << 1)
+#define CEPH_XATTR_REMOVE (1 << 31)
+
+union ceph_mds_request_args {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 file_replication;
+ __le32 unused; /* used to be preferred osd */
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+ struct {
+ __u8 rule; /* currently fcntl or flock */
+ __u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* owner of the lock */
+ __le64 pid; /* process id requesting the lock */
+ __le64 start; /* initial location to lock */
+ __le64 length; /* num bytes to lock from start */
+ __u8 wait; /* will caller wait for lock to become available? */
+ } __attribute__ ((packed)) filelock_change;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+
+struct ceph_mds_request_head {
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+ __le64 ino, cap_id; /* ino and unique cap id */
+ __le32 caps, wanted; /* new issued, wanted */
+ __le32 seq, issue_seq, mseq;
+ __le32 dname_seq; /* if releasing a dentry lease, a */
+ __le32 dname_len; /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+ __le32 op;
+ __le32 result;
+ __le32 mdsmap_epoch;
+ __u8 safe; /* true if committed to disk */
+ __u8 is_dentry, is_target; /* true if dentry, target inode records
+ are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+ __le32 frag; /* this frag splits... */
+ __le32 by; /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+ __le32 nsplits; /* num ceph_frag_tree_split records */
+ struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+ __le32 caps, wanted; /* caps issued, wanted */
+ __le64 cap_id;
+ __le32 seq, mseq;
+ __le64 realm; /* snap realm */
+ __u8 flags; /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */
+#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+ __le64 ino;
+ __le64 snapid;
+ __le32 rdev;
+ __le64 version; /* inode version */
+ __le64 xattr_version; /* version for xattr blob */
+ struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+ struct ceph_file_layout layout;
+ struct ceph_timespec ctime, mtime, atime;
+ __le32 time_warp_seq;
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ __le32 mode, uid, gid;
+ __le32 nlink;
+ __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
+ struct ceph_timespec rctime;
+ struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, symlink string, dir layout, xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+ __le16 mask; /* lease type(s) */
+ __le32 duration_ms; /* lease duration */
+ __le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+ __le32 frag; /* fragment */
+ __le32 auth; /* auth mds, if this is a delegation point */
+ __le32 ndist; /* number of mds' this is replicated on */
+ __le32 dist[];
+} __attribute__ ((packed));
+
+#define CEPH_LOCK_FCNTL 1
+#define CEPH_LOCK_FLOCK 2
+#define CEPH_LOCK_FCNTL_INTR 3
+#define CEPH_LOCK_FLOCK_INTR 4
+
+
+#define CEPH_LOCK_SHARED 1
+#define CEPH_LOCK_EXCL 2
+#define CEPH_LOCK_UNLOCK 4
+
+struct ceph_filelock {
+ __le64 start;/* file offset to start lock at */
+ __le64 length; /* num bytes to lock; 0 for all following start */
+ __le64 client; /* which client holds the lock */
+ __le64 owner; /* owner the lock */
+ __le64 pid; /* process id holding the lock on the client */
+ __u8 type; /* shared lock, exclusive lock, or unlock */
+} __attribute__ ((packed));
+
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN 0
+#define CEPH_FILE_MODE_RD 1
+#define CEPH_FILE_MODE_WR 2
+#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
+#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
+#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+#define CEPH_INLINE_NONE ((__u64)-1)
+
+/* capability bits */
+#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED 1 /* client can reads */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+#define CEPH_CAP_SIMPLE_BITS 2
+#define CEPH_CAP_FILE_BITS 8
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH 2
+#define CEPH_CAP_SLINK 4
+#define CEPH_CAP_SXATTR 6
+#define CEPH_CAP_SFILE 8
+#define CEPH_CAP_SFLOCK 20
+
+#define CEPH_CAP_BITS 22
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
+#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
+#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
+
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
+#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
+ CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_XATTR_SHARED)
+#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_FILE_RD)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_XATTR_SHARED | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
+ CEPH_CAP_LINK_EXCL | \
+ CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+ CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+ CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+ CEPH_CAP_OP_GRANT, /* mds->client grant */
+ CEPH_CAP_OP_REVOKE, /* mds->client revoke */
+ CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
+ CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
+ CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
+ CEPH_CAP_OP_UPDATE, /* client->mds update */
+ CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
+ CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
+ CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
+ CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
+ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
+ CEPH_CAP_OP_RENEW, /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+ __le32 op; /* CEPH_CAP_OP_* */
+ __le64 ino, realm;
+ __le64 cap_id;
+ __le32 seq, issue_seq;
+ __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+ __le32 migrate_seq;
+ __le64 snap_follows;
+ __le32 snap_trace_len;
+
+ /* authlock */
+ __le32 uid, gid, mode;
+
+ /* linklock */
+ __le32 nlink;
+
+ /* xattrlock */
+ __le32 xattr_len;
+ __le64 xattr_version;
+
+ /* filelock */
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ struct ceph_timespec mtime, atime, ctime;
+ struct ceph_file_layout layout;
+ __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_peer {
+ __le64 cap_id;
+ __le32 seq;
+ __le32 mseq;
+ __le32 mds;
+ __u8 flags;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+ __le32 num; /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+ __le64 ino;
+ __le64 cap_id;
+ __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
+#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
+#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
+#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+ __u8 action; /* CEPH_MDS_LEASE_* */
+ __le16 mask; /* which lease */
+ __le64 ino;
+ __le64 first, last; /* snap range */
+ __le32 seq;
+ __le32 duration_ms; /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+ __le32 flock_len; /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+
+struct ceph_mds_cap_reconnect_v1 {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 size;
+ struct ceph_timespec mtime, atime;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed));
+
+struct ceph_mds_snaprealm_reconnect {
+ __le64 ino; /* snap realm base */
+ __le64 seq; /* snap seq for this snap realm */
+ __le64 parent; /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+ CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
+ CEPH_SNAP_OP_CREATE,
+ CEPH_SNAP_OP_DESTROY,
+ CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+ __le32 op; /* CEPH_SNAP_OP_* */
+ __le64 split; /* ino to split off, if any */
+ __le32 num_split_inos; /* # inos belonging to new child realm */
+ __le32 num_split_realms; /* # child realms udner new child realm */
+ __le32 trace_len; /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+ __le64 ino; /* ino */
+ __le64 created; /* snap: when created */
+ __le64 parent; /* ino: parent realm */
+ __le64 parent_since; /* snap: same parent since */
+ __le64 seq; /* snap: version */
+ __le32 num_snaps;
+ __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/kernel/include/linux/ceph/ceph_hash.h b/kernel/include/linux/ceph/ceph_hash.h
new file mode 100644
index 000000000..d099c3f90
--- /dev/null
+++ b/kernel/include/linux/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
+#ifndef FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/kernel/include/linux/ceph/debugfs.h b/kernel/include/linux/ceph/debugfs.h
new file mode 100644
index 000000000..29cf897cc
--- /dev/null
+++ b/kernel/include/linux/ceph/debugfs.h
@@ -0,0 +1,27 @@
+#ifndef _FS_CEPH_DEBUGFS_H
+#define _FS_CEPH_DEBUGFS_H
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/types.h>
+
+#define CEPH_DEFINE_SHOW_FUNC(name) \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, name, inode->i_private); \
+} \
+ \
+static const struct file_operations name##_fops = { \
+ .open = name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+};
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+#endif
+
diff --git a/kernel/include/linux/ceph/decode.h b/kernel/include/linux/ceph/decode.h
new file mode 100644
index 000000000..a6ef9cc26
--- /dev/null
+++ b/kernel/include/linux/ceph/decode.h
@@ -0,0 +1,259 @@
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <linux/err.h>
+#include <linux/bug.h>
+#include <linux/time.h>
+#include <asm/unaligned.h>
+
+#include <linux/ceph/types.h>
+
+/*
+ * in all cases,
+ * void **p pointer to position pointer
+ * void *end pointer to end of buffer (last byte + 1)
+ */
+
+static inline u64 ceph_decode_64(void **p)
+{
+ u64 v = get_unaligned_le64(*p);
+ *p += sizeof(u64);
+ return v;
+}
+static inline u32 ceph_decode_32(void **p)
+{
+ u32 v = get_unaligned_le32(*p);
+ *p += sizeof(u32);
+ return v;
+}
+static inline u16 ceph_decode_16(void **p)
+{
+ u16 v = get_unaligned_le16(*p);
+ *p += sizeof(u16);
+ return v;
+}
+static inline u8 ceph_decode_8(void **p)
+{
+ u8 v = *(u8 *)*p;
+ (*p)++;
+ return v;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+ memcpy(pv, *p, n);
+ *p += n;
+}
+
+/*
+ * bounds check input.
+ */
+static inline int ceph_has_room(void **p, void *end, size_t n)
+{
+ return end >= *p && n <= end - *p;
+}
+
+#define ceph_decode_need(p, end, n, bad) \
+ do { \
+ if (!likely(ceph_has_room(p, end, n))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_decode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u64), bad); \
+ v = ceph_decode_64(p); \
+ } while (0)
+#define ceph_decode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u32), bad); \
+ v = ceph_decode_32(p); \
+ } while (0)
+#define ceph_decode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u16), bad); \
+ v = ceph_decode_16(p); \
+ } while (0)
+#define ceph_decode_8_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u8), bad); \
+ v = ceph_decode_8(p); \
+ } while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_decode_need(p, end, n, bad); \
+ ceph_decode_copy(p, pv, n); \
+ } while (0)
+
+/*
+ * Allocate a buffer big enough to hold the wire-encoded string, and
+ * decode the string into it. The resulting string will always be
+ * terminated with '\0'. If successful, *p will be advanced
+ * past the decoded data. Also, if lenp is not a null pointer, the
+ * length (not including the terminating '\0') will be recorded in
+ * *lenp. Note that a zero-length string is a valid return value.
+ *
+ * Returns a pointer to the newly-allocated string buffer, or a
+ * pointer-coded errno if an error occurs. Neither *p nor *lenp
+ * will have been updated if an error is returned.
+ *
+ * There are two possible failures:
+ * - converting the string would require accessing memory at or
+ * beyond the "end" pointer provided (-ERANGE)
+ * - memory could not be allocated for the result (-ENOMEM)
+ */
+static inline char *ceph_extract_encoded_string(void **p, void *end,
+ size_t *lenp, gfp_t gfp)
+{
+ u32 len;
+ void *sp = *p;
+ char *buf;
+
+ ceph_decode_32_safe(&sp, end, len, bad);
+ if (!ceph_has_room(&sp, end, len))
+ goto bad;
+
+ buf = kmalloc(len + 1, gfp);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ if (len)
+ memcpy(buf, sp, len);
+ buf[len] = '\0';
+
+ *p = (char *) *p + sizeof (u32) + len;
+ if (lenp)
+ *lenp = (size_t) len;
+
+ return buf;
+
+bad:
+ return ERR_PTR(-ERANGE);
+}
+
+/*
+ * struct ceph_timespec <-> struct timespec
+ */
+static inline void ceph_decode_timespec(struct timespec *ts,
+ const struct ceph_timespec *tv)
+{
+ ts->tv_sec = (__kernel_time_t)le32_to_cpu(tv->tv_sec);
+ ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec);
+}
+static inline void ceph_encode_timespec(struct ceph_timespec *tv,
+ const struct timespec *ts)
+{
+ tv->tv_sec = cpu_to_le32((u32)ts->tv_sec);
+ tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec);
+}
+
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+{
+ __be16 ss_family = htons(a->in_addr.ss_family);
+ a->in_addr.ss_family = *(__u16 *)&ss_family;
+}
+static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+{
+ __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
+ a->in_addr.ss_family = ntohs(ss_family);
+ WARN_ON(a->in_addr.ss_family == 512);
+}
+
+/*
+ * encoders
+ */
+static inline void ceph_encode_64(void **p, u64 v)
+{
+ put_unaligned_le64(v, (__le64 *)*p);
+ *p += sizeof(u64);
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+ put_unaligned_le32(v, (__le32 *)*p);
+ *p += sizeof(u32);
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+ put_unaligned_le16(v, (__le16 *)*p);
+ *p += sizeof(u16);
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+ *(u8 *)*p = v;
+ (*p)++;
+}
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+ u64 ino, const char *path)
+{
+ u32 len = path ? strlen(path) : 0;
+ BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, ino);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, path, len);
+ *p += len;
+}
+
+static inline void ceph_encode_string(void **p, void *end,
+ const char *s, u32 len)
+{
+ BUG_ON(*p + sizeof(len) + len > end);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+#define ceph_encode_need(p, end, n, bad) \
+ do { \
+ if (!likely(ceph_has_room(p, end, n))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_encode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u64), bad); \
+ ceph_encode_64(p, v); \
+ } while (0)
+#define ceph_encode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u32), bad); \
+ ceph_encode_32(p, v); \
+ } while (0)
+#define ceph_encode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u16), bad); \
+ ceph_encode_16(p, v); \
+ } while (0)
+#define ceph_encode_8_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u8), bad); \
+ ceph_encode_8(p, v); \
+ } while (0)
+
+#define ceph_encode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_encode_need(p, end, n, bad); \
+ ceph_encode_copy(p, pv, n); \
+ } while (0)
+#define ceph_encode_string_safe(p, end, s, n, bad) \
+ do { \
+ ceph_encode_need(p, end, n, bad); \
+ ceph_encode_string(p, end, s, n); \
+ } while (0)
+
+
+#endif
diff --git a/kernel/include/linux/ceph/libceph.h b/kernel/include/linux/ceph/libceph.h
new file mode 100644
index 000000000..30f92cefa
--- /dev/null
+++ b/kernel/include/linux/ceph/libceph.h
@@ -0,0 +1,230 @@
+#ifndef _FS_CEPH_LIBCEPH_H
+#define _FS_CEPH_LIBCEPH_H
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/bug.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/msgpool.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/ceph_fs.h>
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID (1<<0)
+#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
+#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
+#define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */
+#define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */
+
+#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY)
+
+#define ceph_set_opt(client, opt) \
+ (client)->options->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+ (!!((client)->options->flags & CEPH_OPT_##opt))
+
+struct ceph_options {
+ int flags;
+ struct ceph_fsid fsid;
+ struct ceph_entity_addr my_addr;
+ int mount_timeout;
+ int osd_idle_ttl;
+ int osd_keepalive_timeout;
+
+ /*
+ * any type that can't be simply compared or doesn't need need
+ * to be compared should go beyond this point,
+ * ceph_compare_options() should be updated accordingly
+ */
+
+ struct ceph_entity_addr *mon_addr; /* should be the first
+ pointer type of args */
+ int num_mon;
+ char *name;
+ struct ceph_crypto_key *key;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
+#define CEPH_OSD_KEEPALIVE_DEFAULT 5
+#define CEPH_OSD_IDLE_TTL_DEFAULT 60
+
+#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
+
+#define CEPH_AUTH_NAME_DEFAULT "guest"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file. Delay a minimum amount of time, even if we send a cap
+ * message for some other reason. Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
+
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
+
+/* mount state */
+enum {
+ CEPH_MOUNT_MOUNTING,
+ CEPH_MOUNT_MOUNTED,
+ CEPH_MOUNT_UNMOUNTING,
+ CEPH_MOUNT_UNMOUNTED,
+ CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+ BUG_ON(time_after(b, a));
+ return (long)a - (long)b;
+}
+
+struct ceph_mds_client;
+
+/*
+ * per client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+ struct ceph_fsid fsid;
+ bool have_fsid;
+
+ void *private;
+
+ struct ceph_options *options;
+
+ struct mutex mount_mutex; /* serialize mount attempts */
+ wait_queue_head_t auth_wq;
+ int auth_err;
+
+ int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
+
+ u64 supported_features;
+ u64 required_features;
+
+ struct ceph_messenger msgr; /* messenger instance */
+ struct ceph_mon_client monc;
+ struct ceph_osd_client osdc;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_dir;
+ struct dentry *debugfs_monmap;
+ struct dentry *debugfs_osdmap;
+ struct dentry *debugfs_options;
+#endif
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data. It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+ atomic_t nref;
+ u64 seq;
+ u32 num_snaps;
+ u64 snaps[];
+};
+
+extern struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
+ gfp_t gfp_flags);
+extern struct ceph_snap_context *ceph_get_snap_context(
+ struct ceph_snap_context *sc);
+extern void ceph_put_snap_context(struct ceph_snap_context *sc);
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+ return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+ (off >> PAGE_CACHE_SHIFT);
+}
+
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+/* ceph_common.c */
+extern bool libceph_compatible(void *data);
+
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+extern void *ceph_kvmalloc(size_t size, gfp_t flags);
+
+extern struct ceph_options *ceph_parse_options(char *options,
+ const char *dev_name, const char *dev_name_end,
+ int (*parse_extra_token)(char *c, void *private),
+ void *private);
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client);
+extern void ceph_destroy_options(struct ceph_options *opt);
+extern int ceph_compare_options(struct ceph_options *new_opt,
+ struct ceph_client *client);
+extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
+ void *private,
+ u64 supported_features,
+ u64 required_features);
+extern u64 ceph_client_id(struct ceph_client *client);
+extern void ceph_destroy_client(struct ceph_client *client);
+extern int __ceph_open_session(struct ceph_client *client,
+ unsigned long started);
+extern int ceph_open_session(struct ceph_client *client);
+
+/* pagevec.c */
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+extern struct page **ceph_get_direct_page_vector(const void __user *data,
+ int num_pages,
+ bool write_page);
+extern void ceph_put_page_vector(struct page **pages, int num_pages,
+ bool dirty);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+extern int ceph_copy_user_to_page_vector(struct page **pages,
+ const void __user *data,
+ loff_t off, size_t len);
+extern void ceph_copy_to_page_vector(struct page **pages,
+ const void *data,
+ loff_t off, size_t len);
+extern void ceph_copy_from_page_vector(struct page **pages,
+ void *data,
+ loff_t off, size_t len);
+extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
+
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/kernel/include/linux/ceph/mdsmap.h b/kernel/include/linux/ceph/mdsmap.h
new file mode 100644
index 000000000..87ed09f54
--- /dev/null
+++ b/kernel/include/linux/ceph/mdsmap.h
@@ -0,0 +1,63 @@
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include <linux/bug.h>
+#include <linux/ceph/types.h>
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ bool laggy;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ u32 m_max_mds; /* size of m_addr, m_state arrays */
+ struct ceph_mds_info *m_info;
+
+ /* which object pools file data can be stored in */
+ int m_num_data_pg_pools;
+ u64 *m_data_pg_pools;
+ u64 m_cas_pg_pool;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+ if (w >= m->m_max_mds)
+ return NULL;
+ return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+ BUG_ON(w < 0);
+ if (w >= m->m_max_mds)
+ return CEPH_MDS_STATE_DNE;
+ return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+ if (w >= 0 && w < m->m_max_mds)
+ return m->m_info[w].laggy;
+ return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+
+#endif
diff --git a/kernel/include/linux/ceph/messenger.h b/kernel/include/linux/ceph/messenger.h
new file mode 100644
index 000000000..e15499422
--- /dev/null
+++ b/kernel/include/linux/ceph/messenger.h
@@ -0,0 +1,303 @@
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/blk_types.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/workqueue.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+struct ceph_msg;
+struct ceph_connection;
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+ struct ceph_connection *(*get)(struct ceph_connection *);
+ void (*put)(struct ceph_connection *);
+
+ /* handle an incoming message. */
+ void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+ /* authorize an outgoing connection */
+ struct ceph_auth_handshake *(*get_authorizer) (
+ struct ceph_connection *con,
+ int *proto, int force_new);
+ int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+ int (*invalidate_authorizer)(struct ceph_connection *con);
+
+ /* there was some error on the socket (disconnect, whatever) */
+ void (*fault) (struct ceph_connection *con);
+
+ /* a remote host as terminated a message exchange session, and messages
+ * we sent (or they tried to send us) may be lost. */
+ void (*peer_reset) (struct ceph_connection *con);
+
+ struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip);
+ int (*sign_message) (struct ceph_connection *con, struct ceph_msg *msg);
+
+ int (*check_message_signature) (struct ceph_connection *con,
+ struct ceph_msg *msg);
+};
+
+/* use format string %s%d */
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+ struct ceph_entity_inst inst; /* my name+address */
+ struct ceph_entity_addr my_enc_addr;
+
+ atomic_t stopping;
+ bool nocrc;
+ bool tcp_nodelay;
+
+ /*
+ * the global_seq counts connections i (attempt to) initiate
+ * in order to disambiguate certain connect race conditions.
+ */
+ u32 global_seq;
+ spinlock_t global_seq_lock;
+
+ u64 supported_features;
+ u64 required_features;
+};
+
+enum ceph_msg_data_type {
+ CEPH_MSG_DATA_NONE, /* message contains no data payload */
+ CEPH_MSG_DATA_PAGES, /* data source/destination is a page array */
+ CEPH_MSG_DATA_PAGELIST, /* data source/destination is a pagelist */
+#ifdef CONFIG_BLOCK
+ CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */
+#endif /* CONFIG_BLOCK */
+};
+
+static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
+{
+ switch (type) {
+ case CEPH_MSG_DATA_NONE:
+ case CEPH_MSG_DATA_PAGES:
+ case CEPH_MSG_DATA_PAGELIST:
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+#endif /* CONFIG_BLOCK */
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct ceph_msg_data {
+ struct list_head links; /* ceph_msg->data */
+ enum ceph_msg_data_type type;
+ union {
+#ifdef CONFIG_BLOCK
+ struct {
+ struct bio *bio;
+ size_t bio_length;
+ };
+#endif /* CONFIG_BLOCK */
+ struct {
+ struct page **pages; /* NOT OWNER. */
+ size_t length; /* total # bytes */
+ unsigned int alignment; /* first page */
+ };
+ struct ceph_pagelist *pagelist;
+ };
+};
+
+struct ceph_msg_data_cursor {
+ size_t total_resid; /* across all data items */
+ struct list_head *data_head; /* = &ceph_msg->data */
+
+ struct ceph_msg_data *data; /* current data item */
+ size_t resid; /* bytes not yet consumed */
+ bool last_piece; /* current is last piece */
+ bool need_crc; /* crc update needed */
+ union {
+#ifdef CONFIG_BLOCK
+ struct { /* bio */
+ struct bio *bio; /* bio from list */
+ struct bvec_iter bvec_iter;
+ };
+#endif /* CONFIG_BLOCK */
+ struct { /* pages */
+ unsigned int page_offset; /* offset in page */
+ unsigned short page_index; /* index in array */
+ unsigned short page_count; /* pages in array */
+ };
+ struct { /* pagelist */
+ struct page *page; /* page from list */
+ size_t offset; /* bytes from list */
+ };
+ };
+};
+
+/*
+ * a single message. it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+ struct ceph_msg_header hdr; /* header */
+ union {
+ struct ceph_msg_footer footer; /* footer */
+ struct ceph_msg_footer_old old_footer; /* old format footer */
+ };
+ struct kvec front; /* unaligned blobs of message */
+ struct ceph_buffer *middle;
+
+ size_t data_length;
+ struct list_head data;
+ struct ceph_msg_data_cursor cursor;
+
+ struct ceph_connection *con;
+ struct list_head list_head; /* links for connection lists */
+
+ struct kref kref;
+ bool more_to_follow;
+ bool needs_out_seq;
+ int front_alloc_len;
+ unsigned long ack_stamp; /* tx: when we were acked */
+
+ struct ceph_msgpool *pool;
+};
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL (HZ/2)
+#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+ void *private;
+
+ const struct ceph_connection_operations *ops;
+
+ struct ceph_messenger *msgr;
+
+ atomic_t sock_state;
+ struct socket *sock;
+ struct ceph_entity_addr peer_addr; /* peer address */
+ struct ceph_entity_addr peer_addr_for_me;
+
+ unsigned long flags;
+ unsigned long state;
+ const char *error_msg; /* error message, if any */
+
+ struct ceph_entity_name peer_name; /* peer name */
+
+ u64 peer_features;
+ u32 connect_seq; /* identify the most recent connection
+ attempt for this connection, client */
+ u32 peer_global_seq; /* peer's global seq for this connection */
+
+ int auth_retry; /* true if we need a newer authorizer */
+ void *auth_reply_buf; /* where to put the authorizer reply */
+ int auth_reply_buf_len;
+
+ struct mutex mutex;
+
+ /* out queue */
+ struct list_head out_queue;
+ struct list_head out_sent; /* sending or sent but unacked */
+ u64 out_seq; /* last message queued for send */
+
+ u64 in_seq, in_seq_acked; /* last message received, acked */
+
+ /* connection negotiation temps */
+ char in_banner[CEPH_BANNER_MAX_LEN];
+ struct ceph_msg_connect out_connect;
+ struct ceph_msg_connect_reply in_reply;
+ struct ceph_entity_addr actual_peer_addr;
+
+ /* message out temps */
+ struct ceph_msg *out_msg; /* sending message (== tail of
+ out_sent) */
+ bool out_msg_done;
+
+ struct kvec out_kvec[8], /* sending header/footer data */
+ *out_kvec_cur;
+ int out_kvec_left; /* kvec's left in out_kvec */
+ int out_skip; /* skip this many bytes */
+ int out_kvec_bytes; /* total bytes left */
+ bool out_kvec_is_msg; /* kvec refers to out_msg */
+ int out_more; /* there is more data after the kvecs */
+ __le64 out_temp_ack; /* for writing an ack */
+
+ /* message in temps */
+ struct ceph_msg_header in_hdr;
+ struct ceph_msg *in_msg;
+ u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
+
+ char in_tag; /* protocol control byte */
+ int in_base_pos; /* bytes read */
+ __le64 in_temp_ack; /* for reading an ack */
+
+ struct delayed_work work; /* send|recv work */
+ unsigned long delay; /* current delay interval */
+};
+
+
+extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
+extern int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count);
+
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+extern void ceph_msgr_flush(void);
+
+extern void ceph_messenger_init(struct ceph_messenger *msgr,
+ struct ceph_entity_addr *myaddr,
+ u64 supported_features,
+ u64 required_features,
+ bool nocrc,
+ bool tcp_nodelay);
+
+extern void ceph_con_init(struct ceph_connection *con, void *private,
+ const struct ceph_connection_operations *ops,
+ struct ceph_messenger *msgr);
+extern void ceph_con_open(struct ceph_connection *con,
+ __u8 entity_type, __u64 entity_num,
+ struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+
+extern void ceph_msg_revoke(struct ceph_msg *msg);
+extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
+
+extern void ceph_con_keepalive(struct ceph_connection *con);
+
+extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
+ size_t length, size_t alignment);
+extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
+ struct ceph_pagelist *pagelist);
+#ifdef CONFIG_BLOCK
+extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
+ size_t length);
+#endif /* CONFIG_BLOCK */
+
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
+ bool can_fail);
+
+extern struct ceph_msg *ceph_msg_get(struct ceph_msg *msg);
+extern void ceph_msg_put(struct ceph_msg *msg);
+
+extern void ceph_msg_dump(struct ceph_msg *msg);
+
+#endif
diff --git a/kernel/include/linux/ceph/mon_client.h b/kernel/include/linux/ceph/mon_client.h
new file mode 100644
index 000000000..81810dc21
--- /dev/null
+++ b/kernel/include/linux/ceph/mon_client.h
@@ -0,0 +1,119 @@
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+
+#include <linux/ceph/messenger.h>
+
+struct ceph_client;
+struct ceph_mount_args;
+struct ceph_auth_client;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ u32 num_mon;
+ struct ceph_entity_inst mon_inst[0];
+};
+
+struct ceph_mon_client;
+struct ceph_mon_generic_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+ int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+ struct ceph_mon_client *monc;
+ struct delayed_work delayed_work;
+ unsigned long delay;
+ ceph_monc_request_func_t do_request;
+};
+
+/*
+ * ceph_mon_generic_request is being used for the statfs and
+ * mon_get_version requests which are being done a bit differently
+ * because we need to get data back to the caller
+ */
+struct ceph_mon_generic_request {
+ struct kref kref;
+ u64 tid;
+ struct rb_node node;
+ int result;
+ void *buf;
+ struct completion completion;
+ struct ceph_msg *request; /* original request */
+ struct ceph_msg *reply; /* and reply */
+};
+
+struct ceph_mon_client {
+ struct ceph_client *client;
+ struct ceph_monmap *monmap;
+
+ struct mutex mutex;
+ struct delayed_work delayed_work;
+
+ struct ceph_auth_client *auth;
+ struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
+ int pending_auth;
+
+ bool hunting;
+ int cur_mon; /* last monitor i contacted */
+ unsigned long sub_sent, sub_renew_after;
+ struct ceph_connection con;
+
+ /* pending generic requests */
+ struct rb_root generic_request_tree;
+ int num_generic_requests;
+ u64 last_tid;
+
+ /* mds/osd map */
+ int want_mdsmap;
+ int want_next_osdmap; /* 1 = want, 2 = want+asked */
+ u32 have_osdmap, have_mdsmap;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+};
+
+extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+ struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @want, and also call in when we receive a map. We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
+extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+
+extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
+ unsigned long timeout);
+
+extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
+ struct ceph_statfs *buf);
+
+extern int ceph_monc_do_get_version(struct ceph_mon_client *monc,
+ const char *what, u64 *newest);
+
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+
+#endif
diff --git a/kernel/include/linux/ceph/msgpool.h b/kernel/include/linux/ceph/msgpool.h
new file mode 100644
index 000000000..4b0d38960
--- /dev/null
+++ b/kernel/include/linux/ceph/msgpool.h
@@ -0,0 +1,26 @@
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include <linux/mempool.h>
+#include <linux/ceph/messenger.h>
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+ const char *name;
+ mempool_t *pool;
+ int type; /* preallocated message type */
+ int front_len; /* preallocated payload size */
+};
+
+extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
+ int front_len, int size, bool blocking,
+ const char *name);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
+ int front_len);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
+
+#endif
diff --git a/kernel/include/linux/ceph/msgr.h b/kernel/include/linux/ceph/msgr.h
new file mode 100644
index 000000000..1c1887206
--- /dev/null
+++ b/kernel/include/linux/ceph/msgr.h
@@ -0,0 +1,185 @@
+#ifndef CEPH_MSGR_H
+#define CEPH_MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT 6789 /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST 6789
+#define CEPH_PORT_START 6800 /* non-monitors start here */
+#define CEPH_PORT_LAST 6900
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+ return (__s32)a - (__s32)b;
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+ __u8 type; /* CEPH_ENTITY_TYPE_* */
+ __le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON 0x01
+#define CEPH_ENTITY_TYPE_MDS 0x02
+#define CEPH_ENTITY_TYPE_OSD 0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_AUTH 0x20
+
+#define CEPH_ENTITY_TYPE_ANY 0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+ __le32 type;
+ __le32 nonce; /* unique id for process (e.g. pid) */
+ struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+struct ceph_entity_inst {
+ struct ceph_entity_name name;
+ struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
+ incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
+ with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
+ with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
+#define CEPH_MSGR_TAG_MSG 7 /* message */
+#define CEPH_MSGR_TAG_ACK 8 /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
+#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */
+
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+ __le64 features; /* supported feature bits */
+ __le32 host_type; /* CEPH_ENTITY_TYPE_* */
+ __le32 global_seq; /* count connections initiated by this host */
+ __le32 connect_seq; /* count connections initiated in this session */
+ __le32 protocol_version;
+ __le32 authorizer_protocol;
+ __le32 authorizer_len;
+ __u8 flags; /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+ __u8 tag;
+ __le64 features; /* feature bits for this session */
+ __le32 global_seq;
+ __le32 connect_seq;
+ __le32 protocol_version;
+ __le32 authorizer_len;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header_old {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_inst src, orig_src;
+ __le32 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_name src;
+ __le16 compat_version;
+ __le16 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer_old {
+ __le32 front_crc, middle_crc, data_crc;
+ __u8 flags;
+} __attribute__ ((packed));
+
+struct ceph_msg_footer {
+ __le32 front_crc, middle_crc, data_crc;
+ // sig holds the 64 bits of the digital signature for the message PLR
+ __le64 sig;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+#define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */
+
+
+#endif
diff --git a/kernel/include/linux/ceph/osd_client.h b/kernel/include/linux/ceph/osd_client.h
new file mode 100644
index 000000000..61b19c46b
--- /dev/null
+++ b/kernel/include/linux/ceph/osd_client.h
@@ -0,0 +1,377 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+struct ceph_authorizer;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
+ struct ceph_msg *);
+typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+ atomic_t o_ref;
+ struct ceph_osd_client *o_osdc;
+ int o_osd;
+ int o_incarnation;
+ struct rb_node o_node;
+ struct ceph_connection o_con;
+ struct list_head o_requests;
+ struct list_head o_linger_requests;
+ struct list_head o_osd_lru;
+ struct ceph_auth_handshake o_auth;
+ unsigned long lru_ttl;
+ int o_marked_for_keepalive;
+ struct list_head o_keepalive_item;
+};
+
+
+#define CEPH_OSD_MAX_OP 3
+
+enum ceph_osd_data_type {
+ CEPH_OSD_DATA_TYPE_NONE = 0,
+ CEPH_OSD_DATA_TYPE_PAGES,
+ CEPH_OSD_DATA_TYPE_PAGELIST,
+#ifdef CONFIG_BLOCK
+ CEPH_OSD_DATA_TYPE_BIO,
+#endif /* CONFIG_BLOCK */
+};
+
+struct ceph_osd_data {
+ enum ceph_osd_data_type type;
+ union {
+ struct {
+ struct page **pages;
+ u64 length;
+ u32 alignment;
+ bool pages_from_pool;
+ bool own_pages;
+ };
+ struct ceph_pagelist *pagelist;
+#ifdef CONFIG_BLOCK
+ struct {
+ struct bio *bio; /* list of bios */
+ size_t bio_length; /* total in list */
+ };
+#endif /* CONFIG_BLOCK */
+ };
+};
+
+struct ceph_osd_req_op {
+ u16 op; /* CEPH_OSD_OP_* */
+ u32 flags; /* CEPH_OSD_OP_FLAG_* */
+ u32 payload_len;
+ union {
+ struct ceph_osd_data raw_data_in;
+ struct {
+ u64 offset, length;
+ u64 truncate_size;
+ u32 truncate_seq;
+ struct ceph_osd_data osd_data;
+ } extent;
+ struct {
+ u32 name_len;
+ u32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ struct ceph_osd_data osd_data;
+ } xattr;
+ struct {
+ const char *class_name;
+ const char *method_name;
+ struct ceph_osd_data request_info;
+ struct ceph_osd_data request_data;
+ struct ceph_osd_data response_data;
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ } cls;
+ struct {
+ u64 cookie;
+ u64 ver;
+ u32 prot_ver;
+ u32 timeout;
+ __u8 flag;
+ } watch;
+ struct {
+ u64 expected_object_size;
+ u64 expected_write_size;
+ } alloc_hint;
+ };
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+ u64 r_tid; /* unique for this client */
+ struct rb_node r_node;
+ struct list_head r_req_lru_item;
+ struct list_head r_osd_item;
+ struct list_head r_linger_item;
+ struct list_head r_linger_osd_item;
+ struct ceph_osd *r_osd;
+ struct ceph_pg r_pgid;
+ int r_pg_osds[CEPH_PG_MAX_SIZE];
+ int r_num_pg_osds;
+
+ struct ceph_msg *r_request, *r_reply;
+ int r_flags; /* any additional flags for the osd */
+ u32 r_sent; /* >0 if r_request is sending/sent */
+
+ /* request osd ops array */
+ unsigned int r_num_ops;
+ struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP];
+
+ /* these are updated on each send */
+ __le32 *r_request_osdmap_epoch;
+ __le32 *r_request_flags;
+ __le64 *r_request_pool;
+ void *r_request_pgid;
+ __le32 *r_request_attempts;
+ bool r_paused;
+ struct ceph_eversion *r_request_reassert_version;
+
+ int r_result;
+ int r_reply_op_len[CEPH_OSD_MAX_OP];
+ s32 r_reply_op_result[CEPH_OSD_MAX_OP];
+ int r_got_reply;
+ int r_linger;
+
+ struct ceph_osd_client *r_osdc;
+ struct kref r_kref;
+ bool r_mempool;
+ struct completion r_completion, r_safe_completion;
+ ceph_osdc_callback_t r_callback;
+ ceph_osdc_unsafe_callback_t r_unsafe_callback;
+ struct ceph_eversion r_reassert_version;
+ struct list_head r_unsafe_item;
+
+ struct inode *r_inode; /* for use by callbacks */
+ void *r_priv; /* ditto */
+
+ struct ceph_object_locator r_base_oloc;
+ struct ceph_object_id r_base_oid;
+ struct ceph_object_locator r_target_oloc;
+ struct ceph_object_id r_target_oid;
+
+ u64 r_snapid;
+ unsigned long r_stamp; /* send OR check time */
+
+ struct ceph_snap_context *r_snapc; /* snap context for writes */
+};
+
+struct ceph_request_redirect {
+ struct ceph_object_locator oloc;
+};
+
+struct ceph_osd_event {
+ u64 cookie;
+ int one_shot;
+ struct ceph_osd_client *osdc;
+ void (*cb)(u64, u64, u8, void *);
+ void *data;
+ struct rb_node node;
+ struct list_head osd_node;
+ struct kref kref;
+};
+
+struct ceph_osd_event_work {
+ struct work_struct work;
+ struct ceph_osd_event *event;
+ u64 ver;
+ u64 notify_id;
+ u8 opcode;
+};
+
+struct ceph_osd_client {
+ struct ceph_client *client;
+
+ struct ceph_osdmap *osdmap; /* current map */
+ struct rw_semaphore map_sem;
+ struct completion map_waiters;
+ u64 last_requested_map;
+
+ struct mutex request_mutex;
+ struct rb_root osds; /* osds */
+ struct list_head osd_lru; /* idle osds */
+ u64 timeout_tid; /* tid of timeout triggering rq */
+ u64 last_tid; /* tid of last request */
+ struct rb_root requests; /* pending requests */
+ struct list_head req_lru; /* in-flight lru */
+ struct list_head req_unsent; /* unsent/need-resend queue */
+ struct list_head req_notarget; /* map to no osd */
+ struct list_head req_linger; /* lingering requests */
+ int num_requests;
+ struct delayed_work timeout_work;
+ struct delayed_work osds_timeout_work;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+
+ mempool_t *req_mempool;
+
+ struct ceph_msgpool msgpool_op;
+ struct ceph_msgpool msgpool_op_reply;
+
+ spinlock_t event_lock;
+ struct rb_root event_tree;
+ u64 event_count;
+
+ struct workqueue_struct *notify_wq;
+};
+
+extern int ceph_osdc_setup(void);
+extern void ceph_osdc_cleanup(void);
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+ struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+
+extern void osd_req_op_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode);
+
+extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
+ unsigned int which,
+ struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool,
+ bool own_pages);
+
+extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode,
+ u64 offset, u64 length,
+ u64 truncate_size, u32 truncate_seq);
+extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
+ unsigned int which, u64 length);
+
+extern struct ceph_osd_data *osd_req_op_extent_osd_data(
+ struct ceph_osd_request *osd_req,
+ unsigned int which);
+extern struct ceph_osd_data *osd_req_op_cls_response_data(
+ struct ceph_osd_request *osd_req,
+ unsigned int which);
+
+extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
+ unsigned int which,
+ struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool,
+ bool own_pages);
+extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *,
+ unsigned int which,
+ struct ceph_pagelist *pagelist);
+#ifdef CONFIG_BLOCK
+extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *,
+ unsigned int which,
+ struct bio *bio, size_t bio_length);
+#endif /* CONFIG_BLOCK */
+
+extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
+ unsigned int which,
+ struct ceph_pagelist *pagelist);
+extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
+ unsigned int which,
+ struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool,
+ bool own_pages);
+extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
+ unsigned int which,
+ struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool,
+ bool own_pages);
+
+extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode,
+ const char *class, const char *method);
+extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
+ u16 opcode, const char *name, const void *value,
+ size_t size, u8 cmp_op, u8 cmp_mode);
+extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode,
+ u64 cookie, u64 version, int flag);
+extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ u64 expected_object_size,
+ u64 expected_write_size);
+
+extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+ struct ceph_snap_context *snapc,
+ unsigned int num_ops,
+ bool use_mempool,
+ gfp_t gfp_flags);
+
+extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
+ struct ceph_snap_context *snapc,
+ u64 snap_id,
+ struct timespec *mtime);
+
+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 offset, u64 *len,
+ unsigned int which, int num_ops,
+ int opcode, int flags,
+ struct ceph_snap_context *snapc,
+ u32 truncate_seq, u64 truncate_size,
+ bool use_mempool);
+
+extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+
+extern void ceph_osdc_get_request(struct ceph_osd_request *req);
+extern void ceph_osdc_put_request(struct ceph_osd_request *req);
+
+extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail);
+extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+
+extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int nr_pages,
+ int page_align);
+
+extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *sc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec *mtime,
+ struct page **pages, int nr_pages);
+
+/* watch/notify events */
+extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
+ void (*event_cb)(u64, u64, u8, void *),
+ void *data, struct ceph_osd_event **pevent);
+extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
+extern void ceph_osdc_put_event(struct ceph_osd_event *event);
+#endif
+
diff --git a/kernel/include/linux/ceph/osdmap.h b/kernel/include/linux/ceph/osdmap.h
new file mode 100644
index 000000000..e55c08bc3
--- /dev/null
+++ b/kernel/include/linux/ceph/osdmap.h
@@ -0,0 +1,224 @@
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/ceph_fs.h>
+#include <linux/crush/crush.h>
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds. That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg {
+ uint64_t pool;
+ uint32_t seed;
+};
+
+#define CEPH_POOL_FLAG_HASHPSPOOL 1
+
+struct ceph_pg_pool_info {
+ struct rb_node node;
+ s64 id;
+ u8 type;
+ u8 size;
+ u8 crush_ruleset;
+ u8 object_hash;
+ u32 pg_num, pgp_num;
+ int pg_num_mask, pgp_num_mask;
+ s64 read_tier;
+ s64 write_tier; /* wins for read+write ops */
+ u64 flags;
+ char *name;
+};
+
+static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
+{
+ switch (pool->type) {
+ case CEPH_POOL_TYPE_REP:
+ return true;
+ case CEPH_POOL_TYPE_EC:
+ return false;
+ default:
+ BUG_ON(1);
+ }
+}
+
+struct ceph_object_locator {
+ s64 pool;
+};
+
+/*
+ * Maximum supported by kernel client object name length
+ *
+ * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
+ */
+#define CEPH_MAX_OID_NAME_LEN 100
+
+struct ceph_object_id {
+ char name[CEPH_MAX_OID_NAME_LEN];
+ int name_len;
+};
+
+struct ceph_pg_mapping {
+ struct rb_node node;
+ struct ceph_pg pgid;
+
+ union {
+ struct {
+ int len;
+ int osds[];
+ } pg_temp;
+ struct {
+ int osd;
+ } primary_temp;
+ };
+};
+
+struct ceph_osdmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ u32 mkfs_epoch;
+ struct ceph_timespec created, modified;
+
+ u32 flags; /* CEPH_OSDMAP_* */
+
+ u32 max_osd; /* size of osd_state, _offload, _addr arrays */
+ u8 *osd_state; /* CEPH_OSD_* */
+ u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
+ struct ceph_entity_addr *osd_addr;
+
+ struct rb_root pg_temp;
+ struct rb_root primary_temp;
+
+ u32 *osd_primary_affinity;
+
+ struct rb_root pg_pools;
+ u32 pool_max;
+
+ /* the CRUSH map specifies the mapping of placement groups to
+ * the list of osds that store+replicate them. */
+ struct crush_map *crush;
+
+ struct mutex crush_scratch_mutex;
+ int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
+};
+
+static inline void ceph_oid_set_name(struct ceph_object_id *oid,
+ const char *name)
+{
+ int len;
+
+ len = strlen(name);
+ if (len > sizeof(oid->name)) {
+ WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
+ name, len, sizeof(oid->name));
+ len = sizeof(oid->name);
+ }
+
+ memcpy(oid->name, name, len);
+ oid->name_len = len;
+}
+
+static inline void ceph_oid_copy(struct ceph_object_id *dest,
+ struct ceph_object_id *src)
+{
+ BUG_ON(src->name_len > sizeof(dest->name));
+ memcpy(dest->name, src->name, src->name_len);
+ dest->name_len = src->name_len;
+}
+
+static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
+{
+ return osd >= 0 && osd < map->max_osd &&
+ (map->osd_state[osd] & CEPH_OSD_EXISTS);
+}
+
+static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+ return ceph_osd_exists(map, osd) &&
+ (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
+{
+ return !ceph_osd_is_up(map, osd);
+}
+
+static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+{
+ return map && (map->flags & flag);
+}
+
+extern char *ceph_osdmap_state_str(char *str, int len, int state);
+extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+ int osd)
+{
+ if (osd >= map->max_osd)
+ return NULL;
+ return &map->osd_addr[osd];
+}
+
+static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
+{
+ __u8 version;
+
+ if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) {
+ pr_warn("incomplete pg encoding\n");
+ return -EINVAL;
+ }
+ version = ceph_decode_8(p);
+ if (version > 1) {
+ pr_warn("do not understand pg encoding %d > 1\n",
+ (int)version);
+ return -EINVAL;
+ }
+
+ pgid->pool = ceph_decode_64(p);
+ pgid->seed = ceph_decode_32(p);
+ *p += 4; /* skip deprecated preferred value */
+
+ return 0;
+}
+
+extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
+extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+/* calculate mapping of a file extent to an object */
+extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 len,
+ u64 *bno, u64 *oxoff, u64 *oxlen);
+
+/* calculate mapping of object to a placement group */
+extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_locator *oloc,
+ struct ceph_object_id *oid,
+ struct ceph_pg *pg_out);
+
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
+ struct ceph_pg pgid,
+ int *osds, int *primary);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
+ struct ceph_pg pgid);
+
+extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
+ u64 id);
+
+extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
+extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+
+#endif
diff --git a/kernel/include/linux/ceph/pagelist.h b/kernel/include/linux/ceph/pagelist.h
new file mode 100644
index 000000000..13d71fe18
--- /dev/null
+++ b/kernel/include/linux/ceph/pagelist.h
@@ -0,0 +1,80 @@
+#ifndef __FS_CEPH_PAGELIST_H
+#define __FS_CEPH_PAGELIST_H
+
+#include <asm/byteorder.h>
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+struct ceph_pagelist {
+ struct list_head head;
+ void *mapped_tail;
+ size_t length;
+ size_t room;
+ struct list_head free_list;
+ size_t num_pages_free;
+ atomic_t refcnt;
+};
+
+struct ceph_pagelist_cursor {
+ struct ceph_pagelist *pl; /* pagelist, for error checking */
+ struct list_head *page_lru; /* page in list */
+ size_t room; /* room remaining to reset to */
+};
+
+static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
+{
+ INIT_LIST_HEAD(&pl->head);
+ pl->mapped_tail = NULL;
+ pl->length = 0;
+ pl->room = 0;
+ INIT_LIST_HEAD(&pl->free_list);
+ pl->num_pages_free = 0;
+ atomic_set(&pl->refcnt, 1);
+}
+
+extern void ceph_pagelist_release(struct ceph_pagelist *pl);
+
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
+
+extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
+
+extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
+
+extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+ struct ceph_pagelist_cursor *c);
+
+extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+ struct ceph_pagelist_cursor *c);
+
+static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
+{
+ __le64 ev = cpu_to_le64(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
+{
+ __le32 ev = cpu_to_le32(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
+{
+ __le16 ev = cpu_to_le16(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
+{
+ return ceph_pagelist_append(pl, &v, 1);
+}
+static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
+ char *s, size_t len)
+{
+ int ret = ceph_pagelist_encode_32(pl, len);
+ if (ret)
+ return ret;
+ if (len)
+ return ceph_pagelist_append(pl, s, len);
+ return 0;
+}
+
+#endif
diff --git a/kernel/include/linux/ceph/rados.h b/kernel/include/linux/ceph/rados.h
new file mode 100644
index 000000000..2f822dca1
--- /dev/null
+++ b/kernel/include/linux/ceph/rados.h
@@ -0,0 +1,469 @@
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include <linux/ceph/msgr.h>
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+ unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+ const struct ceph_fsid *b)
+{
+ return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg_v1 {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ * pg_num -- base number of pseudorandomly placed pgs
+ *
+ * pgp_num -- effective number when calculating pg placement. this
+ * is used for pg_num increases. new pgs result in data being "split"
+ * into new pgs. for this to proceed smoothly, new pgs are intiially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split. only _then_ are the new
+ * pgs placed independently.
+ *
+ * lpg_num -- localized pg count (per device). replicas are randomly
+ * selected.
+ *
+ * lpgp_num -- as above.
+ */
+#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */
+
+#define CEPH_POOL_TYPE_REP 1
+#define CEPH_POOL_TYPE_RAID4 2 /* never implemented */
+#define CEPH_POOL_TYPE_EC 3
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+ if ((x & bmask) < b)
+ return x & bmask;
+ else
+ return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+ struct ceph_pg_v1 ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS (1<<0)
+#define CEPH_OSD_UP (1<<1)
+#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
+#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
+
+extern const char *ceph_osd_state_name(int s);
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+/* osd primary-affinity. fixed point value: 0x10000 == baseline */
+#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
+#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
+#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
+
+/*
+ * osd ops
+ *
+ * WARNING: do not use these op codes directly. Use the helpers
+ * defined below instead. In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+#define CEPH_OSD_OP_MODE_CACHE 0x8000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK 0x0100
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */
+
+#define __CEPH_OSD_OP1(mode, nr) \
+ (CEPH_OSD_OP_MODE_##mode | (nr))
+
+#define __CEPH_OSD_OP(mode, type, nr) \
+ (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr))
+
+#define __CEPH_FORALL_OSD_OPS(f) \
+ /** data **/ \
+ /* read */ \
+ f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \
+ f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \
+ f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \
+ \
+ /* fancy read */ \
+ f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \
+ f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \
+ \
+ f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \
+ f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \
+ \
+ /* versioning */ \
+ f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \
+ \
+ f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \
+ \
+ f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \
+ \
+ /* sync */ \
+ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \
+ \
+ /* write */ \
+ f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \
+ f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \
+ f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \
+ f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \
+ f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \
+ \
+ /* fancy write */ \
+ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \
+ f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \
+ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \
+ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \
+ \
+ f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \
+ f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \
+ f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \
+ \
+ f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \
+ f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \
+ \
+ f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \
+ \
+ /* omap */ \
+ f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \
+ f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \
+ f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \
+ f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \
+ f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \
+ f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \
+ f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \
+ f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \
+ f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \
+ \
+ /* tiering */ \
+ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \
+ f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \
+ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \
+ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \
+ f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \
+ f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \
+ f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \
+ f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \
+ \
+ /* convert tmap to omap */ \
+ f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \
+ \
+ /* hints */ \
+ f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \
+ \
+ /** multi **/ \
+ f(CLONERANGE, __CEPH_OSD_OP(WR, MULTI, 1), "clonerange") \
+ f(ASSERT_SRC_VERSION, __CEPH_OSD_OP(RD, MULTI, 2), "assert-src-version") \
+ f(SRC_CMPXATTR, __CEPH_OSD_OP(RD, MULTI, 3), "src-cmpxattr") \
+ \
+ /** attrs **/ \
+ /* read */ \
+ f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \
+ f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \
+ f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \
+ \
+ /* write */ \
+ f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \
+ f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \
+ f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \
+ f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \
+ \
+ /** subop **/ \
+ f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \
+ f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \
+ f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \
+ f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \
+ f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \
+ f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \
+ f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \
+ f(SCRUB_STOP, __CEPH_OSD_OP1(SUB, 8), "scrub-stop") \
+ f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \
+ \
+ /** lock **/ \
+ f(WRLOCK, __CEPH_OSD_OP(WR, LOCK, 1), "wrlock") \
+ f(WRUNLOCK, __CEPH_OSD_OP(WR, LOCK, 2), "wrunlock") \
+ f(RDLOCK, __CEPH_OSD_OP(WR, LOCK, 3), "rdlock") \
+ f(RDUNLOCK, __CEPH_OSD_OP(WR, LOCK, 4), "rdunlock") \
+ f(UPLOCK, __CEPH_OSD_OP(WR, LOCK, 5), "uplock") \
+ f(DNLOCK, __CEPH_OSD_OP(WR, LOCK, 6), "dnlock") \
+ \
+ /** exec **/ \
+ /* note: the RD bit here is wrong; see special-case below in helper */ \
+ f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \
+ \
+ /** pg **/ \
+ f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \
+ f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \
+ f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \
+ f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get")
+
+enum {
+#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode),
+__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
+#undef GENERATE_ENUM_ENTRY
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+static inline int ceph_osd_op_type_multi(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+ return (op & CEPH_OSD_OP_MODE_RD) &&
+ op != CEPH_OSD_OP_CALL;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+ return op & CEPH_OSD_OP_MODE_WR;
+}
+
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * any modification here needs to be updated there
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
+#define CEPH_OSD_TMAP_RM 'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
+
+extern const char *ceph_osd_op_name(int op);
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
+ CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
+ CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
+ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
+ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
+ CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */
+ CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
+ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
+ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
+};
+
+enum {
+ CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
+ CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */
+};
+
+#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/* xattr comparison */
+enum {
+ CEPH_OSD_CMPXATTR_OP_NOP = 0,
+ CEPH_OSD_CMPXATTR_OP_EQ = 1,
+ CEPH_OSD_CMPXATTR_OP_NE = 2,
+ CEPH_OSD_CMPXATTR_OP_GT = 3,
+ CEPH_OSD_CMPXATTR_OP_GTE = 4,
+ CEPH_OSD_CMPXATTR_OP_LT = 5,
+ CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+ CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+ CEPH_OSD_CMPXATTR_MODE_U64 = 2
+};
+
+#define RADOS_NOTIFY_VER 1
+
+/*
+ * an individual object operation. each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_OP_FLAG_* */
+ union {
+ struct {
+ __le64 offset, length;
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 cookie, count;
+ } __attribute__ ((packed)) pgls;
+ struct {
+ __le64 snapid;
+ } __attribute__ ((packed)) snap;
+ struct {
+ __le64 cookie;
+ __le64 ver;
+ __u8 flag; /* 0 = unwatch, 1 = watch */
+ } __attribute__ ((packed)) watch;
+ struct {
+ __le64 offset, length;
+ __le64 src_offset;
+ } __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 expected_object_size;
+ __le64 expected_write_size;
+ } __attribute__ ((packed)) alloc_hint;
+ };
+ __le32 payload_len;
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/kernel/include/linux/ceph/types.h b/kernel/include/linux/ceph/types.h
new file mode 100644
index 000000000..d3ff1cf2d
--- /dev/null
+++ b/kernel/include/linux/ceph/types.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/ceph_frag.h>
+#include <linux/ceph/ceph_hash.h>
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+ u64 ino;
+ u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+ int count;
+};
+
+
+#endif