summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorJohn W. Linville <linville@tuxdriver.com>2012-09-07 15:07:55 -0400
committerJohn W. Linville <linville@tuxdriver.com>2012-09-07 15:07:55 -0400
commitfac805f8c198092de9a2842efd7f5022e2937b18 (patch)
tree7557809c373f97a343c427d8fded0696060394ce /net
parent2461c7d60f9f3821274e4acf9019cba8b82c94b5 (diff)
parentf10723841e624c0726c70356b31d91befed01dd6 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless
Diffstat (limited to 'net')
-rw-r--r--net/bluetooth/hci_conn.c4
-rw-r--r--net/bluetooth/hci_event.c28
-rw-r--r--net/bluetooth/l2cap_core.c12
-rw-r--r--net/bluetooth/l2cap_sock.c4
-rw-r--r--net/bluetooth/sco.c19
-rw-r--r--net/bluetooth/smp.c15
-rw-r--r--net/caif/caif_socket.c2
-rw-r--r--net/ceph/ceph_common.c25
-rw-r--r--net/ceph/crush/mapper.c13
-rw-r--r--net/ceph/messenger.c925
-rw-r--r--net/ceph/mon_client.c76
-rw-r--r--net/ceph/msgpool.c7
-rw-r--r--net/ceph/osd_client.c77
-rw-r--r--net/ceph/osdmap.c59
-rw-r--r--net/core/dev.c60
-rw-r--r--net/core/filter.c8
-rw-r--r--net/core/rtnetlink.c1
-rw-r--r--net/core/skbuff.c124
-rw-r--r--net/core/sock.c60
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/route.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c4
-rw-r--r--net/ipv4/tcp.c4
-rw-r--r--net/ipv4/tcp_cong.c3
-rw-r--r--net/ipv4/tcp_input.c21
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_output.c33
-rw-r--r--net/ipv6/tcp_ipv6.c10
-rw-r--r--net/mac80211/cfg.c9
-rw-r--r--net/mac80211/mesh.c3
-rw-r--r--net/mac80211/mlme.c6
-rw-r--r--net/mac80211/scan.c3
-rw-r--r--net/mac80211/tx.c38
-rw-r--r--net/sctp/ulpevent.c3
-rw-r--r--net/sunrpc/Kconfig5
-rw-r--r--net/sunrpc/auth.c54
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c1
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c20
-rw-r--r--net/sunrpc/cache.c5
-rw-r--r--net/sunrpc/clnt.c12
-rw-r--r--net/sunrpc/rpcb_clnt.c4
-rw-r--r--net/sunrpc/sched.c14
-rw-r--r--net/sunrpc/xdr.c127
-rw-r--r--net/sunrpc/xprtrdma/transport.c3
-rw-r--r--net/sunrpc/xprtsock.c53
-rw-r--r--net/unix/af_unix.c93
-rw-r--r--net/wireless/core.c5
-rw-r--r--net/wireless/core.h1
-rw-r--r--net/wireless/nl80211.c4
-rw-r--r--net/wireless/reg.c19
-rw-r--r--net/wireless/util.c2
-rw-r--r--net/xfrm/xfrm_state.c21
52 files changed, 1403 insertions, 704 deletions
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 5ad7da217474..3c094e78dde9 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -29,6 +29,7 @@
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/a2mp.h>
+#include <net/bluetooth/smp.h>
static void hci_le_connect(struct hci_conn *conn)
{
@@ -619,6 +620,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
{
BT_DBG("hcon %p", conn);
+ if (conn->type == LE_LINK)
+ return smp_conn_security(conn, sec_level);
+
/* For sdp we don't need the link key. */
if (sec_level == BT_SECURITY_SDP)
return 1;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 32e21ad36a68..4fd2cf3bcd05 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1365,6 +1365,9 @@ static bool hci_resolve_next_name(struct hci_dev *hdev)
return false;
e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED);
+ if (!e)
+ return false;
+
if (hci_resolve_name(hdev, e) == 0) {
e->name_state = NAME_PENDING;
return true;
@@ -1393,12 +1396,20 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn,
return;
e = hci_inquiry_cache_lookup_resolve(hdev, bdaddr, NAME_PENDING);
- if (e) {
+ /* If the device was not found in a list of found devices names of which
+ * are pending. there is no need to continue resolving a next name as it
+ * will be done upon receiving another Remote Name Request Complete
+ * Event */
+ if (!e)
+ return;
+
+ list_del(&e->list);
+ if (name) {
e->name_state = NAME_KNOWN;
- list_del(&e->list);
- if (name)
- mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00,
- e->data.rssi, name, name_len);
+ mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00,
+ e->data.rssi, name, name_len);
+ } else {
+ e->name_state = NAME_NOT_KNOWN;
}
if (hci_resolve_next_name(hdev))
@@ -1749,7 +1760,12 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
if (conn->type == ACL_LINK) {
conn->state = BT_CONFIG;
hci_conn_hold(conn);
- conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+
+ if (!conn->out && !hci_conn_ssp_enabled(conn) &&
+ !hci_find_link_key(hdev, &ev->bdaddr))
+ conn->disc_timeout = HCI_PAIRING_TIMEOUT;
+ else
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
} else
conn->state = BT_CONNECTED;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 9f8b29ef5b68..e0abaf3cb6a5 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1198,6 +1198,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn)
sk = chan->sk;
hci_conn_hold(conn->hcon);
+ conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT;
bacpy(&bt_sk(sk)->src, conn->src);
bacpy(&bt_sk(sk)->dst, conn->dst);
@@ -1215,14 +1216,15 @@ clean:
static void l2cap_conn_ready(struct l2cap_conn *conn)
{
struct l2cap_chan *chan;
+ struct hci_conn *hcon = conn->hcon;
BT_DBG("conn %p", conn);
- if (!conn->hcon->out && conn->hcon->type == LE_LINK)
+ if (!hcon->out && hcon->type == LE_LINK)
l2cap_le_conn_ready(conn);
- if (conn->hcon->out && conn->hcon->type == LE_LINK)
- smp_conn_security(conn, conn->hcon->pending_sec_level);
+ if (hcon->out && hcon->type == LE_LINK)
+ smp_conn_security(hcon, hcon->pending_sec_level);
mutex_lock(&conn->chan_lock);
@@ -1235,8 +1237,8 @@ static void l2cap_conn_ready(struct l2cap_conn *conn)
continue;
}
- if (conn->hcon->type == LE_LINK) {
- if (smp_conn_security(conn, chan->sec_level))
+ if (hcon->type == LE_LINK) {
+ if (smp_conn_security(hcon, chan->sec_level))
l2cap_chan_ready(chan);
} else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 13f6a9816feb..2542abd3336f 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -619,7 +619,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch
break;
}
- if (smp_conn_security(conn, sec.level))
+ if (smp_conn_security(conn->hcon, sec.level))
break;
sk->sk_state = BT_CONFIG;
chan->state = BT_CONFIG;
@@ -1180,7 +1180,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p
chan = l2cap_chan_create();
if (!chan) {
- l2cap_sock_kill(sk);
+ sk_free(sk);
return NULL;
}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index caa109df6452..dc42b917aaaf 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -131,6 +131,15 @@ static int sco_conn_del(struct hci_conn *hcon, int err)
sco_sock_clear_timer(sk);
sco_chan_del(sk, err);
bh_unlock_sock(sk);
+
+ sco_conn_lock(conn);
+ conn->sk = NULL;
+ sco_pi(sk)->conn = NULL;
+ sco_conn_unlock(conn);
+
+ if (conn->hcon)
+ hci_conn_put(conn->hcon);
+
sco_sock_kill(sk);
}
@@ -821,16 +830,6 @@ static void sco_chan_del(struct sock *sk, int err)
BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
- if (conn) {
- sco_conn_lock(conn);
- conn->sk = NULL;
- sco_pi(sk)->conn = NULL;
- sco_conn_unlock(conn);
-
- if (conn->hcon)
- hci_conn_put(conn->hcon);
- }
-
sk->sk_state = BT_CLOSED;
sk->sk_err = err;
sk->sk_state_change(sk);
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 16ef0dc85a0a..8c225ef349cd 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -267,10 +267,10 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason, u8 send)
mgmt_auth_failed(conn->hcon->hdev, conn->dst, hcon->type,
hcon->dst_type, reason);
- if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) {
- cancel_delayed_work_sync(&conn->security_timer);
+ cancel_delayed_work_sync(&conn->security_timer);
+
+ if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags))
smp_chan_destroy(conn);
- }
}
#define JUST_WORKS 0x00
@@ -579,8 +579,11 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags))
smp = smp_chan_create(conn);
+ else
+ smp = conn->smp_chan;
- smp = conn->smp_chan;
+ if (!smp)
+ return SMP_UNSPECIFIED;
smp->preq[0] = SMP_CMD_PAIRING_REQ;
memcpy(&smp->preq[1], req, sizeof(*req));
@@ -757,9 +760,9 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
return 0;
}
-int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level)
+int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
{
- struct hci_conn *hcon = conn->hcon;
+ struct l2cap_conn *conn = hcon->l2cap_data;
struct smp_chan *smp = conn->smp_chan;
__u8 authreq;
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 78f1cdad5b33..095259f83902 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
err = sk_filter(sk, skb);
if (err)
return err;
- if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
+ if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
set_rx_flow_off(cf_sk);
net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index ba4323bce0e9..69e38db28e5f 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -17,6 +17,7 @@
#include <linux/string.h>
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/debugfs.h>
#include <linux/ceph/decode.h>
@@ -460,27 +461,23 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
client->auth_err = 0;
client->extra_mon_dispatch = NULL;
- client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT |
+ client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
supported_features;
- client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT |
+ client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
required_features;
/* msgr */
if (ceph_test_opt(client, MYIP))
myaddr = &client->options->my_addr;
- client->msgr = ceph_messenger_create(myaddr,
- client->supported_features,
- client->required_features);
- if (IS_ERR(client->msgr)) {
- err = PTR_ERR(client->msgr);
- goto fail;
- }
- client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+ ceph_messenger_init(&client->msgr, myaddr,
+ client->supported_features,
+ client->required_features,
+ ceph_test_opt(client, NOCRC));
/* subsystems */
err = ceph_monc_init(&client->monc, client);
if (err < 0)
- goto fail_msgr;
+ goto fail;
err = ceph_osdc_init(&client->osdc, client);
if (err < 0)
goto fail_monc;
@@ -489,8 +486,6 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
fail_monc:
ceph_monc_stop(&client->monc);
-fail_msgr:
- ceph_messenger_destroy(client->msgr);
fail:
kfree(client);
return ERR_PTR(err);
@@ -501,6 +496,8 @@ void ceph_destroy_client(struct ceph_client *client)
{
dout("destroy_client %p\n", client);
+ atomic_set(&client->msgr.stopping, 1);
+
/* unmount */
ceph_osdc_stop(&client->osdc);
@@ -508,8 +505,6 @@ void ceph_destroy_client(struct ceph_client *client)
ceph_debugfs_client_cleanup(client);
- ceph_messenger_destroy(client->msgr);
-
ceph_destroy_options(client->options);
kfree(client);
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index d7edc24333b8..35fce755ce10 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
int item = 0;
int itemtype;
int collide, reject;
- const unsigned int orig_tries = 5; /* attempts before we fall back to search */
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
bucket->id, x, outpos, numrep);
@@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
reject = 1;
goto reject;
}
- if (flocal >= (in->size>>1) &&
- flocal > orig_tries)
+ if (map->choose_local_fallback_tries > 0 &&
+ flocal >= (in->size>>1) &&
+ flocal > map->choose_local_fallback_tries)
item = bucket_perm_choose(in, x, r);
else
item = crush_bucket_choose(in, x, r);
@@ -422,13 +422,14 @@ reject:
ftotal++;
flocal++;
- if (collide && flocal < 3)
+ if (collide && flocal <= map->choose_local_tries)
/* retry locally a few times */
retry_bucket = 1;
- else if (flocal <= in->size + orig_tries)
+ else if (map->choose_local_fallback_tries > 0 &&
+ flocal <= in->size + map->choose_local_fallback_tries)
/* exhaustive bucket search */
retry_bucket = 1;
- else if (ftotal < 20)
+ else if (ftotal <= map->choose_total_tries)
/* then retry descent */
retry_descent = 1;
else
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 10255e81be79..b9796750034a 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -29,6 +29,74 @@
* the sender.
*/
+/*
+ * We track the state of the socket on a given connection using
+ * values defined below. The transition to a new socket state is
+ * handled by a function which verifies we aren't coming from an
+ * unexpected state.
+ *
+ * --------
+ * | NEW* | transient initial state
+ * --------
+ * | con_sock_state_init()
+ * v
+ * ----------
+ * | CLOSED | initialized, but no socket (and no
+ * ---------- TCP connection)
+ * ^ \
+ * | \ con_sock_state_connecting()
+ * | ----------------------
+ * | \
+ * + con_sock_state_closed() \
+ * |+--------------------------- \
+ * | \ \ \
+ * | ----------- \ \
+ * | | CLOSING | socket event; \ \
+ * | ----------- await close \ \
+ * | ^ \ |
+ * | | \ |
+ * | + con_sock_state_closing() \ |
+ * | / \ | |
+ * | / --------------- | |
+ * | / \ v v
+ * | / --------------
+ * | / -----------------| CONNECTING | socket created, TCP
+ * | | / -------------- connect initiated
+ * | | | con_sock_state_connected()
+ * | | v
+ * -------------
+ * | CONNECTED | TCP connection established
+ * -------------
+ *
+ * State values for ceph_connection->sock_state; NEW is assumed to be 0.
+ */
+
+#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */
+#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */
+#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */
+#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
+#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */
+
+/*
+ * connection states
+ */
+#define CON_STATE_CLOSED 1 /* -> PREOPEN */
+#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */
+#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */
+#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */
+#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */
+#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */
+
+/*
+ * ceph_connection flag bits
+ */
+#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop
+ * messages on errors */
+#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
+#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
+#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
+#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
+
/* static tag bytes (protocol control messages) */
static char tag_msg = CEPH_MSGR_TAG_MSG;
static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -147,72 +215,130 @@ void ceph_msgr_flush(void)
}
EXPORT_SYMBOL(ceph_msgr_flush);
+/* Connection socket state transition functions */
+
+static void con_sock_state_init(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSED);
+}
+
+static void con_sock_state_connecting(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CONNECTING);
+}
+
+static void con_sock_state_connected(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CONNECTED);
+}
+
+static void con_sock_state_closing(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
+ old_state != CON_SOCK_STATE_CONNECTED &&
+ old_state != CON_SOCK_STATE_CLOSING))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSING);
+}
+
+static void con_sock_state_closed(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
+ old_state != CON_SOCK_STATE_CLOSING &&
+ old_state != CON_SOCK_STATE_CONNECTING &&
+ old_state != CON_SOCK_STATE_CLOSED))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSED);
+}
/*
* socket callback functions
*/
/* data available on socket, or listen socket received a connect */
-static void ceph_data_ready(struct sock *sk, int count_unused)
+static void ceph_sock_data_ready(struct sock *sk, int count_unused)
{
struct ceph_connection *con = sk->sk_user_data;
+ if (atomic_read(&con->msgr->stopping)) {
+ return;
+ }
if (sk->sk_state != TCP_CLOSE_WAIT) {
- dout("ceph_data_ready on %p state = %lu, queueing work\n",
+ dout("%s on %p state = %lu, queueing work\n", __func__,
con, con->state);
queue_con(con);
}
}
/* socket has buffer space for writing */
-static void ceph_write_space(struct sock *sk)
+static void ceph_sock_write_space(struct sock *sk)
{
struct ceph_connection *con = sk->sk_user_data;
/* only queue to workqueue if there is data we want to write,
* and there is sufficient space in the socket buffer to accept
- * more data. clear SOCK_NOSPACE so that ceph_write_space()
+ * more data. clear SOCK_NOSPACE so that ceph_sock_write_space()
* doesn't get called again until try_write() fills the socket
* buffer. See net/ipv4/tcp_input.c:tcp_check_space()
* and net/core/stream.c:sk_stream_write_space().
*/
- if (test_bit(WRITE_PENDING, &con->state)) {
+ if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) {
if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
- dout("ceph_write_space %p queueing write work\n", con);
+ dout("%s %p queueing write work\n", __func__, con);
clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
queue_con(con);
}
} else {
- dout("ceph_write_space %p nothing to write\n", con);
+ dout("%s %p nothing to write\n", __func__, con);
}
}
/* socket's state has changed */
-static void ceph_state_change(struct sock *sk)
+static void ceph_sock_state_change(struct sock *sk)
{
struct ceph_connection *con = sk->sk_user_data;
- dout("ceph_state_change %p state = %lu sk_state = %u\n",
+ dout("%s %p state = %lu sk_state = %u\n", __func__,
con, con->state, sk->sk_state);
- if (test_bit(CLOSED, &con->state))
- return;
-
switch (sk->sk_state) {
case TCP_CLOSE:
- dout("ceph_state_change TCP_CLOSE\n");
+ dout("%s TCP_CLOSE\n", __func__);
case TCP_CLOSE_WAIT:
- dout("ceph_state_change TCP_CLOSE_WAIT\n");
- if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
- if (test_bit(CONNECTING, &con->state))
- con->error_msg = "connection failed";
- else
- con->error_msg = "socket closed";
- queue_con(con);
- }
+ dout("%s TCP_CLOSE_WAIT\n", __func__);
+ con_sock_state_closing(con);
+ set_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
+ queue_con(con);
break;
case TCP_ESTABLISHED:
- dout("ceph_state_change TCP_ESTABLISHED\n");
+ dout("%s TCP_ESTABLISHED\n", __func__);
+ con_sock_state_connected(con);
queue_con(con);
break;
default: /* Everything else is uninteresting */
@@ -228,9 +354,9 @@ static void set_sock_callbacks(struct socket *sock,
{
struct sock *sk = sock->sk;
sk->sk_user_data = con;
- sk->sk_data_ready = ceph_data_ready;
- sk->sk_write_space = ceph_write_space;
- sk->sk_state_change = ceph_state_change;
+ sk->sk_data_ready = ceph_sock_data_ready;
+ sk->sk_write_space = ceph_sock_write_space;
+ sk->sk_state_change = ceph_sock_state_change;
}
@@ -262,6 +388,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
+ con_sock_state_connecting(con);
ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
O_NONBLOCK);
if (ret == -EINPROGRESS) {
@@ -277,7 +404,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
return ret;
}
con->sock = sock;
-
return 0;
}
@@ -333,16 +459,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
*/
static int con_close_socket(struct ceph_connection *con)
{
- int rc;
+ int rc = 0;
dout("con_close_socket on %p sock %p\n", con, con->sock);
- if (!con->sock)
- return 0;
- set_bit(SOCK_CLOSED, &con->state);
- rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
- sock_release(con->sock);
- con->sock = NULL;
- clear_bit(SOCK_CLOSED, &con->state);
+ if (con->sock) {
+ rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+ sock_release(con->sock);
+ con->sock = NULL;
+ }
+
+ /*
+ * Forcibly clear the SOCK_CLOSED flag. It gets set
+ * independent of the connection mutex, and we could have
+ * received a socket close event before we had the chance to
+ * shut the socket down.
+ */
+ clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
+
+ con_sock_state_closed(con);
return rc;
}
@@ -353,6 +487,10 @@ static int con_close_socket(struct ceph_connection *con)
static void ceph_msg_remove(struct ceph_msg *msg)
{
list_del_init(&msg->list_head);
+ BUG_ON(msg->con == NULL);
+ msg->con->ops->put(msg->con);
+ msg->con = NULL;
+
ceph_msg_put(msg);
}
static void ceph_msg_remove_list(struct list_head *head)
@@ -372,8 +510,11 @@ static void reset_connection(struct ceph_connection *con)
ceph_msg_remove_list(&con->out_sent);
if (con->in_msg) {
+ BUG_ON(con->in_msg->con != con);
+ con->in_msg->con = NULL;
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
+ con->ops->put(con);
}
con->connect_seq = 0;
@@ -391,32 +532,44 @@ static void reset_connection(struct ceph_connection *con)
*/
void ceph_con_close(struct ceph_connection *con)
{
+ mutex_lock(&con->mutex);
dout("con_close %p peer %s\n", con,
ceph_pr_addr(&con->peer_addr.in_addr));
- set_bit(CLOSED, &con->state); /* in case there's queued work */
- clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
- clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
- clear_bit(KEEPALIVE_PENDING, &con->state);
- clear_bit(WRITE_PENDING, &con->state);
- mutex_lock(&con->mutex);
+ con->state = CON_STATE_CLOSED;
+
+ clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */
+ clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
+ clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+ clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
+ clear_bit(CON_FLAG_BACKOFF, &con->flags);
+
reset_connection(con);
con->peer_global_seq = 0;
cancel_delayed_work(&con->work);
+ con_close_socket(con);
mutex_unlock(&con->mutex);
- queue_con(con);
}
EXPORT_SYMBOL(ceph_con_close);
/*
* Reopen a closed connection, with a new peer address.
*/
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+void ceph_con_open(struct ceph_connection *con,
+ __u8 entity_type, __u64 entity_num,
+ struct ceph_entity_addr *addr)
{
+ mutex_lock(&con->mutex);
dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
- set_bit(OPENING, &con->state);
- clear_bit(CLOSED, &con->state);
+
+ BUG_ON(con->state != CON_STATE_CLOSED);
+ con->state = CON_STATE_PREOPEN;
+
+ con->peer_name.type = (__u8) entity_type;
+ con->peer_name.num = cpu_to_le64(entity_num);
+
memcpy(&con->peer_addr, addr, sizeof(*addr));
con->delay = 0; /* reset backoff memory */
+ mutex_unlock(&con->mutex);
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_open);
@@ -430,42 +583,26 @@ bool ceph_con_opened(struct ceph_connection *con)
}
/*
- * generic get/put
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
- int nref = __atomic_add_unless(&con->nref, 1, 0);
-
- dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1);
-
- return nref ? con : NULL;
-}
-
-void ceph_con_put(struct ceph_connection *con)
-{
- int nref = atomic_dec_return(&con->nref);
-
- BUG_ON(nref < 0);
- if (nref == 0) {
- BUG_ON(con->sock);
- kfree(con);
- }
- dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref);
-}
-
-/*
* initialize a new connection.
*/
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+void ceph_con_init(struct ceph_connection *con, void *private,
+ const struct ceph_connection_operations *ops,
+ struct ceph_messenger *msgr)
{
dout("con_init %p\n", con);
memset(con, 0, sizeof(*con));
- atomic_set(&con->nref, 1);
+ con->private = private;
+ con->ops = ops;
con->msgr = msgr;
+
+ con_sock_state_init(con);
+
mutex_init(&con->mutex);
INIT_LIST_HEAD(&con->out_queue);
INIT_LIST_HEAD(&con->out_sent);
INIT_DELAYED_WORK(&con->work, con_work);
+
+ con->state = CON_STATE_CLOSED;
}
EXPORT_SYMBOL(ceph_con_init);
@@ -486,14 +623,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
return ret;
}
-static void ceph_con_out_kvec_reset(struct ceph_connection *con)
+static void con_out_kvec_reset(struct ceph_connection *con)
{
con->out_kvec_left = 0;
con->out_kvec_bytes = 0;
con->out_kvec_cur = &con->out_kvec[0];
}
-static void ceph_con_out_kvec_add(struct ceph_connection *con,
+static void con_out_kvec_add(struct ceph_connection *con,
size_t size, void *data)
{
int index;
@@ -507,6 +644,53 @@ static void ceph_con_out_kvec_add(struct ceph_connection *con,
con->out_kvec_bytes += size;
}
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+ if (!bio) {
+ *iter = NULL;
+ *seg = 0;
+ return;
+ }
+ *iter = bio;
+ *seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+ if (*bio_iter == NULL)
+ return;
+
+ BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+ (*seg)++;
+ if (*seg == (*bio_iter)->bi_vcnt)
+ init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
+static void prepare_write_message_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+
+ BUG_ON(!msg);
+ BUG_ON(!msg->hdr.data_len);
+
+ /* initialize page iterator */
+ con->out_msg_pos.page = 0;
+ if (msg->pages)
+ con->out_msg_pos.page_pos = msg->page_alignment;
+ else
+ con->out_msg_pos.page_pos = 0;
+#ifdef CONFIG_BLOCK
+ if (msg->bio)
+ init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+ con->out_msg_pos.data_pos = 0;
+ con->out_msg_pos.did_page_crc = false;
+ con->out_more = 1; /* data + footer will follow */
+}
+
/*
* Prepare footer for currently outgoing message, and finish things
* off. Assumes out_kvec* are already valid.. we just add on to the end.
@@ -516,6 +700,8 @@ static void prepare_write_message_footer(struct ceph_connection *con)
struct ceph_msg *m = con->out_msg;
int v = con->out_kvec_left;
+ m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
dout("prepare_write_message_footer %p\n", con);
con->out_kvec_is_msg = true;
con->out_kvec[v].iov_base = &m->footer;
@@ -534,7 +720,7 @@ static void prepare_write_message(struct ceph_connection *con)
struct ceph_msg *m;
u32 crc;
- ceph_con_out_kvec_reset(con);
+ con_out_kvec_reset(con);
con->out_kvec_is_msg = true;
con->out_msg_done = false;
@@ -542,14 +728,16 @@ static void prepare_write_message(struct ceph_connection *con)
* TCP packet that's a good thing. */
if (con->in_seq > con->in_seq_acked) {
con->in_seq_acked = con->in_seq;
- ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack),
+ con_out_kvec_add(con, sizeof (con->out_temp_ack),
&con->out_temp_ack);
}
+ BUG_ON(list_empty(&con->out_queue));
m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
con->out_msg = m;
+ BUG_ON(m->con != con);
/* put message on sent list */
ceph_msg_get(m);
@@ -576,18 +764,18 @@ static void prepare_write_message(struct ceph_connection *con)
BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
/* tag + hdr + front + middle */
- ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
- ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
- ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+ con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+ con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
if (m->middle)
- ceph_con_out_kvec_add(con, m->middle->vec.iov_len,
+ con_out_kvec_add(con, m->middle->vec.iov_len,
m->middle->vec.iov_base);
/* fill in crc (except data pages), footer */
crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
con->out_msg->hdr.crc = cpu_to_le32(crc);
- con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+ con->out_msg->footer.flags = 0;
crc = crc32c(0, m->front.iov_base, m->front.iov_len);
con->out_msg->footer.front_crc = cpu_to_le32(crc);
@@ -597,28 +785,19 @@ static void prepare_write_message(struct ceph_connection *con)
con->out_msg->footer.middle_crc = cpu_to_le32(crc);
} else
con->out_msg->footer.middle_crc = 0;
- con->out_msg->footer.data_crc = 0;
- dout("prepare_write_message front_crc %u data_crc %u\n",
+ dout("%s front_crc %u middle_crc %u\n", __func__,
le32_to_cpu(con->out_msg->footer.front_crc),
le32_to_cpu(con->out_msg->footer.middle_crc));
/* is there a data payload? */
- if (le32_to_cpu(m->hdr.data_len) > 0) {
- /* initialize page iterator */
- con->out_msg_pos.page = 0;
- if (m->pages)
- con->out_msg_pos.page_pos = m->page_alignment;
- else
- con->out_msg_pos.page_pos = 0;
- con->out_msg_pos.data_pos = 0;
- con->out_msg_pos.did_page_crc = false;
- con->out_more = 1; /* data + footer will follow */
- } else {
+ con->out_msg->footer.data_crc = 0;
+ if (m->hdr.data_len)
+ prepare_write_message_data(con);
+ else
/* no, queue up footer too and be done */
prepare_write_message_footer(con);
- }
- set_bit(WRITE_PENDING, &con->state);
+ set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
}
/*
@@ -630,16 +809,16 @@ static void prepare_write_ack(struct ceph_connection *con)
con->in_seq_acked, con->in_seq);
con->in_seq_acked = con->in_seq;
- ceph_con_out_kvec_reset(con);
+ con_out_kvec_reset(con);
- ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack),
+ con_out_kvec_add(con, sizeof (con->out_temp_ack),
&con->out_temp_ack);
con->out_more = 1; /* more will follow.. eventually.. */
- set_bit(WRITE_PENDING, &con->state);
+ set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
}
/*
@@ -648,9 +827,9 @@ static void prepare_write_ack(struct ceph_connection *con)
static void prepare_write_keepalive(struct ceph_connection *con)
{
dout("prepare_write_keepalive %p\n", con);
- ceph_con_out_kvec_reset(con);
- ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
- set_bit(WRITE_PENDING, &con->state);
+ con_out_kvec_reset(con);
+ con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+ set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
}
/*
@@ -665,27 +844,21 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
if (!con->ops->get_authorizer) {
con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
con->out_connect.authorizer_len = 0;
-
return NULL;
}
/* Can't hold the mutex while getting authorizer */
-
mutex_unlock(&con->mutex);
-
auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
-
mutex_lock(&con->mutex);
if (IS_ERR(auth))
return auth;
- if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state))
+ if (con->state != CON_STATE_NEGOTIATING)
return ERR_PTR(-EAGAIN);
con->auth_reply_buf = auth->authorizer_reply_buf;
con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
-
-
return auth;
}
@@ -694,12 +867,12 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
*/
static void prepare_write_banner(struct ceph_connection *con)
{
- ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
- ceph_con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+ con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+ con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
&con->msgr->my_enc_addr);
con->out_more = 0;
- set_bit(WRITE_PENDING, &con->state);
+ set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
}
static int prepare_write_connect(struct ceph_connection *con)
@@ -742,14 +915,15 @@ static int prepare_write_connect(struct ceph_connection *con)
con->out_connect.authorizer_len = auth ?
cpu_to_le32(auth->authorizer_buf_len) : 0;
- ceph_con_out_kvec_add(con, sizeof (con->out_connect),
+ con_out_kvec_reset(con);
+ con_out_kvec_add(con, sizeof (con->out_connect),
&con->out_connect);
if (auth && auth->authorizer_buf_len)
- ceph_con_out_kvec_add(con, auth->authorizer_buf_len,
+ con_out_kvec_add(con, auth->authorizer_buf_len,
auth->authorizer_buf);
con->out_more = 0;
- set_bit(WRITE_PENDING, &con->state);
+ set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
return 0;
}
@@ -797,30 +971,34 @@ out:
return ret; /* done! */
}
-#ifdef CONFIG_BLOCK
-static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
+ size_t len, size_t sent, bool in_trail)
{
- if (!bio) {
- *iter = NULL;
- *seg = 0;
- return;
- }
- *iter = bio;
- *seg = bio->bi_idx;
-}
+ struct ceph_msg *msg = con->out_msg;
-static void iter_bio_next(struct bio **bio_iter, int *seg)
-{
- if (*bio_iter == NULL)
- return;
+ BUG_ON(!msg);
+ BUG_ON(!sent);
- BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+ con->out_msg_pos.data_pos += sent;
+ con->out_msg_pos.page_pos += sent;
+ if (sent < len)
+ return;
- (*seg)++;
- if (*seg == (*bio_iter)->bi_vcnt)
- init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
-}
+ BUG_ON(sent != len);
+ con->out_msg_pos.page_pos = 0;
+ con->out_msg_pos.page++;
+ con->out_msg_pos.did_page_crc = false;
+ if (in_trail)
+ list_move_tail(&page->lru,
+ &msg->trail->head);
+ else if (msg->pagelist)
+ list_move_tail(&page->lru,
+ &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+ else if (msg->bio)
+ iter_bio_next(&msg->bio_iter, &msg->bio_seg);
#endif
+}
/*
* Write as much message data payload as we can. If we finish, queue
@@ -837,41 +1015,36 @@ static int write_partial_msg_pages(struct ceph_connection *con)
bool do_datacrc = !con->msgr->nocrc;
int ret;
int total_max_write;
- int in_trail = 0;
- size_t trail_len = (msg->trail ? msg->trail->length : 0);
+ bool in_trail = false;
+ const size_t trail_len = (msg->trail ? msg->trail->length : 0);
+ const size_t trail_off = data_len - trail_len;
dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
- con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+ con, msg, con->out_msg_pos.page, msg->nr_pages,
con->out_msg_pos.page_pos);
-#ifdef CONFIG_BLOCK
- if (msg->bio && !msg->bio_iter)
- init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
-#endif
-
+ /*
+ * Iterate through each page that contains data to be
+ * written, and send as much as possible for each.
+ *
+ * If we are calculating the data crc (the default), we will
+ * need to map the page. If we have no pages, they have
+ * been revoked, so use the zero page.
+ */
while (data_len > con->out_msg_pos.data_pos) {
struct page *page = NULL;
int max_write = PAGE_SIZE;
int bio_offset = 0;
- total_max_write = data_len - trail_len -
- con->out_msg_pos.data_pos;
-
- /*
- * if we are calculating the data crc (the default), we need
- * to map the page. if our pages[] has been revoked, use the
- * zero page.
- */
-
- /* have we reached the trail part of the data? */
- if (con->out_msg_pos.data_pos >= data_len - trail_len) {
- in_trail = 1;
+ in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off;
+ if (!in_trail)
+ total_max_write = trail_off - con->out_msg_pos.data_pos;
+ if (in_trail) {
total_max_write = data_len - con->out_msg_pos.data_pos;
page = list_first_entry(&msg->trail->head,
struct page, lru);
- max_write = PAGE_SIZE;
} else if (msg->pages) {
page = msg->pages[con->out_msg_pos.page];
} else if (msg->pagelist) {
@@ -894,15 +1067,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
if (do_datacrc && !con->out_msg_pos.did_page_crc) {
void *base;
- u32 crc;
- u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+ u32 crc = le32_to_cpu(msg->footer.data_crc);
char *kaddr;
kaddr = kmap(page);
BUG_ON(kaddr == NULL);
base = kaddr + con->out_msg_pos.page_pos + bio_offset;
- crc = crc32c(tmpcrc, base, len);
- con->out_msg->footer.data_crc = cpu_to_le32(crc);
+ crc = crc32c(crc, base, len);
+ msg->footer.data_crc = cpu_to_le32(crc);
con->out_msg_pos.did_page_crc = true;
}
ret = ceph_tcp_sendpage(con->sock, page,
@@ -915,31 +1087,15 @@ static int write_partial_msg_pages(struct ceph_connection *con)
if (ret <= 0)
goto out;
- con->out_msg_pos.data_pos += ret;
- con->out_msg_pos.page_pos += ret;
- if (ret == len) {
- con->out_msg_pos.page_pos = 0;
- con->out_msg_pos.page++;
- con->out_msg_pos.did_page_crc = false;
- if (in_trail)
- list_move_tail(&page->lru,
- &msg->trail->head);
- else if (msg->pagelist)
- list_move_tail(&page->lru,
- &msg->pagelist->head);
-#ifdef CONFIG_BLOCK
- else if (msg->bio)
- iter_bio_next(&msg->bio_iter, &msg->bio_seg);
-#endif
- }
+ out_msg_pos_next(con, page, len, (size_t) ret, in_trail);
}
dout("write_partial_msg_pages %p msg %p done\n", con, msg);
/* prepare and queue up footer, too */
if (!do_datacrc)
- con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
- ceph_con_out_kvec_reset(con);
+ msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con_out_kvec_reset(con);
prepare_write_message_footer(con);
ret = 1;
out:
@@ -1351,20 +1507,14 @@ static int process_banner(struct ceph_connection *con)
ceph_pr_addr(&con->msgr->inst.addr.in_addr));
}
- set_bit(NEGOTIATING, &con->state);
- prepare_read_connect(con);
return 0;
}
static void fail_protocol(struct ceph_connection *con)
{
reset_connection(con);
- set_bit(CLOSED, &con->state); /* in case there's queued work */
-
- mutex_unlock(&con->mutex);
- if (con->ops->bad_proto)
- con->ops->bad_proto(con);
- mutex_lock(&con->mutex);
+ BUG_ON(con->state != CON_STATE_NEGOTIATING);
+ con->state = CON_STATE_CLOSED;
}
static int process_connect(struct ceph_connection *con)
@@ -1407,7 +1557,6 @@ static int process_connect(struct ceph_connection *con)
return -1;
}
con->auth_retry = 1;
- ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con);
if (ret < 0)
return ret;
@@ -1428,7 +1577,6 @@ static int process_connect(struct ceph_connection *con)
ENTITY_NAME(con->peer_name),
ceph_pr_addr(&con->peer_addr.in_addr));
reset_connection(con);
- ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con);
if (ret < 0)
return ret;
@@ -1440,8 +1588,7 @@ static int process_connect(struct ceph_connection *con)
if (con->ops->peer_reset)
con->ops->peer_reset(con);
mutex_lock(&con->mutex);
- if (test_bit(CLOSED, &con->state) ||
- test_bit(OPENING, &con->state))
+ if (con->state != CON_STATE_NEGOTIATING)
return -EAGAIN;
break;
@@ -1454,7 +1601,6 @@ static int process_connect(struct ceph_connection *con)
le32_to_cpu(con->out_connect.connect_seq),
le32_to_cpu(con->in_reply.connect_seq));
con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
- ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con);
if (ret < 0)
return ret;
@@ -1471,7 +1617,6 @@ static int process_connect(struct ceph_connection *con)
le32_to_cpu(con->in_reply.global_seq));
get_global_seq(con->msgr,
le32_to_cpu(con->in_reply.global_seq));
- ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con);
if (ret < 0)
return ret;
@@ -1489,7 +1634,10 @@ static int process_connect(struct ceph_connection *con)
fail_protocol(con);
return -1;
}
- clear_bit(CONNECTING, &con->state);
+
+ BUG_ON(con->state != CON_STATE_NEGOTIATING);
+ con->state = CON_STATE_OPEN;
+
con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
con->connect_seq++;
con->peer_features = server_feat;
@@ -1501,7 +1649,9 @@ static int process_connect(struct ceph_connection *con)
le32_to_cpu(con->in_reply.connect_seq));
if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
- set_bit(LOSSYTX, &con->state);
+ set_bit(CON_FLAG_LOSSYTX, &con->flags);
+
+ con->delay = 0; /* reset backoff memory */
prepare_read_tag(con);
break;
@@ -1587,10 +1737,7 @@ static int read_partial_message_section(struct ceph_connection *con,
return 1;
}
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip);
-
+static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
static int read_partial_message_pages(struct ceph_connection *con,
struct page **pages,
@@ -1633,9 +1780,6 @@ static int read_partial_message_bio(struct ceph_connection *con,
void *p;
int ret, left;
- if (IS_ERR(bv))
- return PTR_ERR(bv);
-
left = min((int)(data_len - con->in_msg_pos.data_pos),
(int)(bv->bv_len - con->in_msg_pos.page_pos));
@@ -1672,7 +1816,6 @@ static int read_partial_message(struct ceph_connection *con)
int ret;
unsigned int front_len, middle_len, data_len;
bool do_datacrc = !con->msgr->nocrc;
- int skip;
u64 seq;
u32 crc;
@@ -1723,10 +1866,13 @@ static int read_partial_message(struct ceph_connection *con)
/* allocate message? */
if (!con->in_msg) {
+ int skip = 0;
+
dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
con->in_hdr.front_len, con->in_hdr.data_len);
- skip = 0;
- con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+ ret = ceph_con_in_msg_alloc(con, &skip);
+ if (ret < 0)
+ return ret;
if (skip) {
/* skip this message */
dout("alloc_msg said skip message\n");
@@ -1737,11 +1883,9 @@ static int read_partial_message(struct ceph_connection *con)
con->in_seq++;
return 0;
}
- if (!con->in_msg) {
- con->error_msg =
- "error allocating memory for incoming message";
- return -ENOMEM;
- }
+
+ BUG_ON(!con->in_msg);
+ BUG_ON(con->in_msg->con != con);
m = con->in_msg;
m->front.iov_len = 0; /* haven't read it yet */
if (m->middle)
@@ -1753,6 +1897,11 @@ static int read_partial_message(struct ceph_connection *con)
else
con->in_msg_pos.page_pos = 0;
con->in_msg_pos.data_pos = 0;
+
+#ifdef CONFIG_BLOCK
+ if (m->bio)
+ init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
}
/* front */
@@ -1769,10 +1918,6 @@ static int read_partial_message(struct ceph_connection *con)
if (ret <= 0)
return ret;
}
-#ifdef CONFIG_BLOCK
- if (m->bio && !m->bio_iter)
- init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
-#endif
/* (page) data */
while (con->in_msg_pos.data_pos < data_len) {
@@ -1783,7 +1928,7 @@ static int read_partial_message(struct ceph_connection *con)
return ret;
#ifdef CONFIG_BLOCK
} else if (m->bio) {
-
+ BUG_ON(!m->bio_iter);
ret = read_partial_message_bio(con,
&m->bio_iter, &m->bio_seg,
data_len, do_datacrc);
@@ -1837,8 +1982,11 @@ static void process_message(struct ceph_connection *con)
{
struct ceph_msg *msg;
+ BUG_ON(con->in_msg->con != con);
+ con->in_msg->con = NULL;
msg = con->in_msg;
con->in_msg = NULL;
+ con->ops->put(con);
/* if first message, set peer_name */
if (con->peer_name.type == 0)
@@ -1858,7 +2006,6 @@ static void process_message(struct ceph_connection *con)
con->ops->dispatch(con, msg);
mutex_lock(&con->mutex);
- prepare_read_tag(con);
}
@@ -1870,22 +2017,19 @@ static int try_write(struct ceph_connection *con)
{
int ret = 1;
- dout("try_write start %p state %lu nref %d\n", con, con->state,
- atomic_read(&con->nref));
+ dout("try_write start %p state %lu\n", con, con->state);
more:
dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
/* open the socket first? */
- if (con->sock == NULL) {
- ceph_con_out_kvec_reset(con);
+ if (con->state == CON_STATE_PREOPEN) {
+ BUG_ON(con->sock);
+ con->state = CON_STATE_CONNECTING;
+
+ con_out_kvec_reset(con);
prepare_write_banner(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- goto out;
prepare_read_banner(con);
- set_bit(CONNECTING, &con->state);
- clear_bit(NEGOTIATING, &con->state);
BUG_ON(con->in_msg);
con->in_tag = CEPH_MSGR_TAG_READY;
@@ -1932,7 +2076,7 @@ more_kvec:
}
do_next:
- if (!test_bit(CONNECTING, &con->state)) {
+ if (con->state == CON_STATE_OPEN) {
/* is anything else pending? */
if (!list_empty(&con->out_queue)) {
prepare_write_message(con);
@@ -1942,14 +2086,15 @@ do_next:
prepare_write_ack(con);
goto more;
}
- if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+ if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING,
+ &con->flags)) {
prepare_write_keepalive(con);
goto more;
}
}
/* Nothing to do! */
- clear_bit(WRITE_PENDING, &con->state);
+ clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
dout("try_write nothing else to write.\n");
ret = 0;
out:
@@ -1966,38 +2111,42 @@ static int try_read(struct ceph_connection *con)
{
int ret = -1;
- if (!con->sock)
- return 0;
-
- if (test_bit(STANDBY, &con->state))
+more:
+ dout("try_read start on %p state %lu\n", con, con->state);
+ if (con->state != CON_STATE_CONNECTING &&
+ con->state != CON_STATE_NEGOTIATING &&
+ con->state != CON_STATE_OPEN)
return 0;
- dout("try_read start on %p\n", con);
+ BUG_ON(!con->sock);
-more:
dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
con->in_base_pos);
- /*
- * process_connect and process_message drop and re-take
- * con->mutex. make sure we handle a racing close or reopen.
- */
- if (test_bit(CLOSED, &con->state) ||
- test_bit(OPENING, &con->state)) {
- ret = -EAGAIN;
+ if (con->state == CON_STATE_CONNECTING) {
+ dout("try_read connecting\n");
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_banner(con);
+ if (ret < 0)
+ goto out;
+
+ BUG_ON(con->state != CON_STATE_CONNECTING);
+ con->state = CON_STATE_NEGOTIATING;
+
+ /* Banner is good, exchange connection info */
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ goto out;
+ prepare_read_connect(con);
+
+ /* Send connection info before awaiting response */
goto out;
}
- if (test_bit(CONNECTING, &con->state)) {
- if (!test_bit(NEGOTIATING, &con->state)) {
- dout("try_read connecting\n");
- ret = read_partial_banner(con);
- if (ret <= 0)
- goto out;
- ret = process_banner(con);
- if (ret < 0)
- goto out;
- }
+ if (con->state == CON_STATE_NEGOTIATING) {
+ dout("try_read negotiating\n");
ret = read_partial_connect(con);
if (ret <= 0)
goto out;
@@ -2007,6 +2156,8 @@ more:
goto more;
}
+ BUG_ON(con->state != CON_STATE_OPEN);
+
if (con->in_base_pos < 0) {
/*
* skipping + discarding content.
@@ -2040,7 +2191,8 @@ more:
prepare_read_ack(con);
break;
case CEPH_MSGR_TAG_CLOSE:
- set_bit(CLOSED, &con->state); /* fixme */
+ con_close_socket(con);
+ con->state = CON_STATE_CLOSED;
goto out;
default:
goto bad_tag;
@@ -2063,6 +2215,8 @@ more:
if (con->in_tag == CEPH_MSGR_TAG_READY)
goto more;
process_message(con);
+ if (con->state == CON_STATE_OPEN)
+ prepare_read_tag(con);
goto more;
}
if (con->in_tag == CEPH_MSGR_TAG_ACK) {
@@ -2091,12 +2245,6 @@ bad_tag:
*/
static void queue_con(struct ceph_connection *con)
{
- if (test_bit(DEAD, &con->state)) {
- dout("queue_con %p ignoring: DEAD\n",
- con);
- return;
- }
-
if (!con->ops->get(con)) {
dout("queue_con %p ref count 0\n", con);
return;
@@ -2121,7 +2269,26 @@ static void con_work(struct work_struct *work)
mutex_lock(&con->mutex);
restart:
- if (test_and_clear_bit(BACKOFF, &con->state)) {
+ if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
+ switch (con->state) {
+ case CON_STATE_CONNECTING:
+ con->error_msg = "connection failed";
+ break;
+ case CON_STATE_NEGOTIATING:
+ con->error_msg = "negotiation failed";
+ break;
+ case CON_STATE_OPEN:
+ con->error_msg = "socket closed";
+ break;
+ default:
+ dout("unrecognized con state %d\n", (int)con->state);
+ con->error_msg = "unrecognized con state";
+ BUG();
+ }
+ goto fault;
+ }
+
+ if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
dout("con_work %p backing off\n", con);
if (queue_delayed_work(ceph_msgr_wq, &con->work,
round_jiffies_relative(con->delay))) {
@@ -2135,35 +2302,35 @@ restart:
}
}
- if (test_bit(STANDBY, &con->state)) {
+ if (con->state == CON_STATE_STANDBY) {
dout("con_work %p STANDBY\n", con);
goto done;
}
- if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
- dout("con_work CLOSED\n");
- con_close_socket(con);
+ if (con->state == CON_STATE_CLOSED) {
+ dout("con_work %p CLOSED\n", con);
+ BUG_ON(con->sock);
goto done;
}
- if (test_and_clear_bit(OPENING, &con->state)) {
- /* reopen w/ new peer */
+ if (con->state == CON_STATE_PREOPEN) {
dout("con_work OPENING\n");
- con_close_socket(con);
+ BUG_ON(con->sock);
}
- if (test_and_clear_bit(SOCK_CLOSED, &con->state))
- goto fault;
-
ret = try_read(con);
if (ret == -EAGAIN)
goto restart;
- if (ret < 0)
+ if (ret < 0) {
+ con->error_msg = "socket error on read";
goto fault;
+ }
ret = try_write(con);
if (ret == -EAGAIN)
goto restart;
- if (ret < 0)
+ if (ret < 0) {
+ con->error_msg = "socket error on write";
goto fault;
+ }
done:
mutex_unlock(&con->mutex);
@@ -2172,7 +2339,6 @@ done_unlocked:
return;
fault:
- mutex_unlock(&con->mutex);
ceph_fault(con); /* error/fault path */
goto done_unlocked;
}
@@ -2183,26 +2349,31 @@ fault:
* exponential backoff
*/
static void ceph_fault(struct ceph_connection *con)
+ __releases(con->mutex)
{
pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
dout("fault %p state %lu to peer %s\n",
con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
- if (test_bit(LOSSYTX, &con->state)) {
- dout("fault on LOSSYTX channel\n");
- goto out;
- }
-
- mutex_lock(&con->mutex);
- if (test_bit(CLOSED, &con->state))
- goto out_unlock;
+ BUG_ON(con->state != CON_STATE_CONNECTING &&
+ con->state != CON_STATE_NEGOTIATING &&
+ con->state != CON_STATE_OPEN);
con_close_socket(con);
+ if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) {
+ dout("fault on LOSSYTX channel, marking CLOSED\n");
+ con->state = CON_STATE_CLOSED;
+ goto out_unlock;
+ }
+
if (con->in_msg) {
+ BUG_ON(con->in_msg->con != con);
+ con->in_msg->con = NULL;
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
+ con->ops->put(con);
}
/* Requeue anything that hasn't been acked */
@@ -2211,12 +2382,13 @@ static void ceph_fault(struct ceph_connection *con)
/* If there are no messages queued or keepalive pending, place
* the connection in a STANDBY state */
if (list_empty(&con->out_queue) &&
- !test_bit(KEEPALIVE_PENDING, &con->state)) {
+ !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) {
dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
- clear_bit(WRITE_PENDING, &con->state);
- set_bit(STANDBY, &con->state);
+ clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+ con->state = CON_STATE_STANDBY;
} else {
/* retry after a delay. */
+ con->state = CON_STATE_PREOPEN;
if (con->delay == 0)
con->delay = BASE_DELAY_INTERVAL;
else if (con->delay < MAX_DELAY_INTERVAL)
@@ -2237,13 +2409,12 @@ static void ceph_fault(struct ceph_connection *con)
* that when con_work restarts we schedule the
* delay then.
*/
- set_bit(BACKOFF, &con->state);
+ set_bit(CON_FLAG_BACKOFF, &con->flags);
}
}
out_unlock:
mutex_unlock(&con->mutex);
-out:
/*
* in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones.
@@ -2260,18 +2431,14 @@ out:
/*
- * create a new messenger instance
+ * initialize a new messenger instance
*/
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
- u32 supported_features,
- u32 required_features)
+void ceph_messenger_init(struct ceph_messenger *msgr,
+ struct ceph_entity_addr *myaddr,
+ u32 supported_features,
+ u32 required_features,
+ bool nocrc)
{
- struct ceph_messenger *msgr;
-
- msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
- if (msgr == NULL)
- return ERR_PTR(-ENOMEM);
-
msgr->supported_features = supported_features;
msgr->required_features = required_features;
@@ -2284,30 +2451,23 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
msgr->inst.addr.type = 0;
get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
encode_my_addr(msgr);
+ msgr->nocrc = nocrc;
- dout("messenger_create %p\n", msgr);
- return msgr;
-}
-EXPORT_SYMBOL(ceph_messenger_create);
+ atomic_set(&msgr->stopping, 0);
-void ceph_messenger_destroy(struct ceph_messenger *msgr)
-{
- dout("destroy %p\n", msgr);
- kfree(msgr);
- dout("destroyed messenger %p\n", msgr);
+ dout("%s %p\n", __func__, msgr);
}
-EXPORT_SYMBOL(ceph_messenger_destroy);
+EXPORT_SYMBOL(ceph_messenger_init);
static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
- if (test_and_clear_bit(STANDBY, &con->state)) {
- mutex_lock(&con->mutex);
+ if (con->state == CON_STATE_STANDBY) {
dout("clear_standby %p and ++connect_seq\n", con);
+ con->state = CON_STATE_PREOPEN;
con->connect_seq++;
- WARN_ON(test_bit(WRITE_PENDING, &con->state));
- WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state));
- mutex_unlock(&con->mutex);
+ WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags));
+ WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags));
}
}
@@ -2316,21 +2476,24 @@ static void clear_standby(struct ceph_connection *con)
*/
void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
{
- if (test_bit(CLOSED, &con->state)) {
- dout("con_send %p closed, dropping %p\n", con, msg);
- ceph_msg_put(msg);
- return;
- }
-
/* set src+dst */
msg->hdr.src = con->msgr->inst.name;
-
BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
-
msg->needs_out_seq = true;
- /* queue */
mutex_lock(&con->mutex);
+
+ if (con->state == CON_STATE_CLOSED) {
+ dout("con_send %p closed, dropping %p\n", con, msg);
+ ceph_msg_put(msg);
+ mutex_unlock(&con->mutex);
+ return;
+ }
+
+ BUG_ON(msg->con != NULL);
+ msg->con = con->ops->get(con);
+ BUG_ON(msg->con == NULL);
+
BUG_ON(!list_empty(&msg->list_head));
list_add_tail(&msg->list_head, &con->out_queue);
dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
@@ -2339,12 +2502,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
le32_to_cpu(msg->hdr.front_len),
le32_to_cpu(msg->hdr.middle_len),
le32_to_cpu(msg->hdr.data_len));
+
+ clear_standby(con);
mutex_unlock(&con->mutex);
/* if there wasn't anything waiting to send before, queue
* new work */
- clear_standby(con);
- if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_send);
@@ -2352,24 +2516,34 @@ EXPORT_SYMBOL(ceph_con_send);
/*
* Revoke a message that was previously queued for send
*/
-void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+void ceph_msg_revoke(struct ceph_msg *msg)
{
+ struct ceph_connection *con = msg->con;
+
+ if (!con)
+ return; /* Message not in our possession */
+
mutex_lock(&con->mutex);
if (!list_empty(&msg->list_head)) {
- dout("con_revoke %p msg %p - was on queue\n", con, msg);
+ dout("%s %p msg %p - was on queue\n", __func__, con, msg);
list_del_init(&msg->list_head);
- ceph_msg_put(msg);
+ BUG_ON(msg->con == NULL);
+ msg->con->ops->put(msg->con);
+ msg->con = NULL;
msg->hdr.seq = 0;
+
+ ceph_msg_put(msg);
}
if (con->out_msg == msg) {
- dout("con_revoke %p msg %p - was sending\n", con, msg);
+ dout("%s %p msg %p - was sending\n", __func__, con, msg);
con->out_msg = NULL;
if (con->out_kvec_is_msg) {
con->out_skip = con->out_kvec_bytes;
con->out_kvec_is_msg = false;
}
- ceph_msg_put(msg);
msg->hdr.seq = 0;
+
+ ceph_msg_put(msg);
}
mutex_unlock(&con->mutex);
}
@@ -2377,17 +2551,27 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
/*
* Revoke a message that we may be reading data into
*/
-void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+void ceph_msg_revoke_incoming(struct ceph_msg *msg)
{
+ struct ceph_connection *con;
+
+ BUG_ON(msg == NULL);
+ if (!msg->con) {
+ dout("%s msg %p null con\n", __func__, msg);
+
+ return; /* Message not in our possession */
+ }
+
+ con = msg->con;
mutex_lock(&con->mutex);
- if (con->in_msg && con->in_msg == msg) {
+ if (con->in_msg == msg) {
unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
/* skip rest of message */
- dout("con_revoke_pages %p msg %p revoked\n", con, msg);
- con->in_base_pos = con->in_base_pos -
+ dout("%s %p msg %p revoked\n", __func__, con, msg);
+ con->in_base_pos = con->in_base_pos -
sizeof(struct ceph_msg_header) -
front_len -
middle_len -
@@ -2398,8 +2582,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
con->in_tag = CEPH_MSGR_TAG_READY;
con->in_seq++;
} else {
- dout("con_revoke_pages %p msg %p pages %p no-op\n",
- con, con->in_msg, msg);
+ dout("%s %p in_msg %p msg %p no-op\n",
+ __func__, con, con->in_msg, msg);
}
mutex_unlock(&con->mutex);
}
@@ -2410,9 +2594,11 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
void ceph_con_keepalive(struct ceph_connection *con)
{
dout("con_keepalive %p\n", con);
+ mutex_lock(&con->mutex);
clear_standby(con);
- if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
- test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ mutex_unlock(&con->mutex);
+ if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 &&
+ test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_keepalive);
@@ -2431,6 +2617,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
if (m == NULL)
goto out;
kref_init(&m->kref);
+
+ m->con = NULL;
INIT_LIST_HEAD(&m->list_head);
m->hdr.tid = 0;
@@ -2526,46 +2714,77 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
}
/*
- * Generic message allocator, for incoming messages.
+ * Allocate a message for receiving an incoming message on a
+ * connection, and save the result in con->in_msg. Uses the
+ * connection's private alloc_msg op if available.
+ *
+ * Returns 0 on success, or a negative error code.
+ *
+ * On success, if we set *skip = 1:
+ * - the next message should be skipped and ignored.
+ * - con->in_msg == NULL
+ * or if we set *skip = 0:
+ * - con->in_msg is non-null.
+ * On error (ENOMEM, EAGAIN, ...),
+ * - con->in_msg == NULL
*/
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip)
+static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
{
+ struct ceph_msg_header *hdr = &con->in_hdr;
int type = le16_to_cpu(hdr->type);
int front_len = le32_to_cpu(hdr->front_len);
int middle_len = le32_to_cpu(hdr->middle_len);
- struct ceph_msg *msg = NULL;
- int ret;
+ int ret = 0;
+
+ BUG_ON(con->in_msg != NULL);
if (con->ops->alloc_msg) {
+ struct ceph_msg *msg;
+
mutex_unlock(&con->mutex);
msg = con->ops->alloc_msg(con, hdr, skip);
mutex_lock(&con->mutex);
- if (!msg || *skip)
- return NULL;
+ if (con->state != CON_STATE_OPEN) {
+ ceph_msg_put(msg);
+ return -EAGAIN;
+ }
+ con->in_msg = msg;
+ if (con->in_msg) {
+ con->in_msg->con = con->ops->get(con);
+ BUG_ON(con->in_msg->con == NULL);
+ }
+ if (*skip) {
+ con->in_msg = NULL;
+ return 0;
+ }
+ if (!con->in_msg) {
+ con->error_msg =
+ "error allocating memory for incoming message";
+ return -ENOMEM;
+ }
}
- if (!msg) {
- *skip = 0;
- msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
- if (!msg) {
+ if (!con->in_msg) {
+ con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ if (!con->in_msg) {
pr_err("unable to allocate msg type %d len %d\n",
type, front_len);
- return NULL;
+ return -ENOMEM;
}
- msg->page_alignment = le16_to_cpu(hdr->data_off);
+ con->in_msg->con = con->ops->get(con);
+ BUG_ON(con->in_msg->con == NULL);
+ con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
}
- memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+ memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
- if (middle_len && !msg->middle) {
- ret = ceph_alloc_middle(con, msg);
+ if (middle_len && !con->in_msg->middle) {
+ ret = ceph_alloc_middle(con, con->in_msg);
if (ret < 0) {
- ceph_msg_put(msg);
- return NULL;
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
}
}
- return msg;
+ return ret;
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index d0649a9655be..105d533b55f3 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
monc->pending_auth = 1;
monc->m_auth->front.iov_len = len;
monc->m_auth->hdr.front_len = cpu_to_le32(len);
- ceph_con_revoke(monc->con, monc->m_auth);
+ ceph_msg_revoke(monc->m_auth);
ceph_msg_get(monc->m_auth); /* keep our ref */
- ceph_con_send(monc->con, monc->m_auth);
+ ceph_con_send(&monc->con, monc->m_auth);
}
/*
@@ -117,8 +117,11 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
static void __close_session(struct ceph_mon_client *monc)
{
dout("__close_session closing mon%d\n", monc->cur_mon);
- ceph_con_revoke(monc->con, monc->m_auth);
- ceph_con_close(monc->con);
+ ceph_msg_revoke(monc->m_auth);
+ ceph_msg_revoke_incoming(monc->m_auth_reply);
+ ceph_msg_revoke(monc->m_subscribe);
+ ceph_msg_revoke_incoming(monc->m_subscribe_ack);
+ ceph_con_close(&monc->con);
monc->cur_mon = -1;
monc->pending_auth = 0;
ceph_auth_reset(monc->auth);
@@ -142,9 +145,8 @@ static int __open_session(struct ceph_mon_client *monc)
monc->want_next_osdmap = !!monc->want_next_osdmap;
dout("open_session mon%d opening\n", monc->cur_mon);
- monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
- monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
- ceph_con_open(monc->con,
+ ceph_con_open(&monc->con,
+ CEPH_ENTITY_TYPE_MON, monc->cur_mon,
&monc->monmap->mon_inst[monc->cur_mon].addr);
/* initiatiate authentication handshake */
@@ -226,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- ceph_con_revoke(monc->con, msg);
- ceph_con_send(monc->con, ceph_msg_get(msg));
+ ceph_msg_revoke(msg);
+ ceph_con_send(&monc->con, ceph_msg_get(msg));
monc->sub_sent = jiffies | 1; /* never 0 */
}
@@ -247,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
if (monc->hunting) {
pr_info("mon%d %s session established\n",
monc->cur_mon,
- ceph_pr_addr(&monc->con->peer_addr.in_addr));
+ ceph_pr_addr(&monc->con.peer_addr.in_addr));
monc->hunting = false;
}
dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -439,6 +441,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
m = NULL;
} else {
dout("get_generic_reply %lld got %p\n", tid, req->reply);
+ *skip = 0;
m = ceph_msg_get(req->reply);
/*
* we don't need to track the connection reading into
@@ -461,7 +464,7 @@ static int do_generic_request(struct ceph_mon_client *monc,
req->request->hdr.tid = cpu_to_le64(req->tid);
__insert_generic_request(monc, req);
monc->num_generic_requests++;
- ceph_con_send(monc->con, ceph_msg_get(req->request));
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
mutex_unlock(&monc->mutex);
err = wait_for_completion_interruptible(&req->completion);
@@ -684,8 +687,9 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
req = rb_entry(p, struct ceph_mon_generic_request, node);
- ceph_con_revoke(monc->con, req->request);
- ceph_con_send(monc->con, ceph_msg_get(req->request));
+ ceph_msg_revoke(req->request);
+ ceph_msg_revoke_incoming(req->reply);
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
}
}
@@ -705,7 +709,7 @@ static void delayed_work(struct work_struct *work)
__close_session(monc);
__open_session(monc); /* continue hunting */
} else {
- ceph_con_keepalive(monc->con);
+ ceph_con_keepalive(&monc->con);
__validate_auth(monc);
@@ -760,19 +764,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
goto out;
/* connection */
- monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
- if (!monc->con)
- goto out_monmap;
- ceph_con_init(monc->client->msgr, monc->con);
- monc->con->private = monc;
- monc->con->ops = &mon_con_ops;
-
/* authentication */
monc->auth = ceph_auth_init(cl->options->name,
cl->options->key);
if (IS_ERR(monc->auth)) {
err = PTR_ERR(monc->auth);
- goto out_con;
+ goto out_monmap;
}
monc->auth->want_keys =
CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
@@ -801,6 +798,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
if (!monc->m_auth)
goto out_auth_reply;
+ ceph_con_init(&monc->con, monc, &mon_con_ops,
+ &monc->client->msgr);
+
monc->cur_mon = -1;
monc->hunting = true;
monc->sub_renew_after = jiffies;
@@ -824,8 +824,6 @@ out_subscribe_ack:
ceph_msg_put(monc->m_subscribe_ack);
out_auth:
ceph_auth_destroy(monc->auth);
-out_con:
- monc->con->ops->put(monc->con);
out_monmap:
kfree(monc->monmap);
out:
@@ -841,10 +839,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
mutex_lock(&monc->mutex);
__close_session(monc);
- monc->con->private = NULL;
- monc->con->ops->put(monc->con);
- monc->con = NULL;
-
mutex_unlock(&monc->mutex);
/*
@@ -888,8 +882,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
dout("authenticated, starting session\n");
- monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
- monc->client->msgr->inst.name.num =
+ monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+ monc->client->msgr.inst.name.num =
cpu_to_le64(monc->auth->global_id);
__send_subscribe(monc);
@@ -1000,6 +994,8 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP:
m = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ if (!m)
+ return NULL; /* ENOMEM--return skip == 0 */
break;
}
@@ -1029,7 +1025,7 @@ static void mon_fault(struct ceph_connection *con)
if (!monc->hunting)
pr_info("mon%d %s session lost, "
"hunting for new mon\n", monc->cur_mon,
- ceph_pr_addr(&monc->con->peer_addr.in_addr));
+ ceph_pr_addr(&monc->con.peer_addr.in_addr));
__close_session(monc);
if (!monc->hunting) {
@@ -1044,9 +1040,23 @@ out:
mutex_unlock(&monc->mutex);
}
+/*
+ * We can ignore refcounting on the connection struct, as all references
+ * will come from the messenger workqueue, which is drained prior to
+ * mon_client destruction.
+ */
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+ return con;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+}
+
static const struct ceph_connection_operations mon_con_ops = {
- .get = ceph_con_get,
- .put = ceph_con_put,
+ .get = con_get,
+ .put = con_put,
.dispatch = dispatch,
.fault = mon_fault,
.alloc_msg = mon_alloc_msg,
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index 11d5f4196a73..ddec1c10ac80 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
struct ceph_msgpool *pool = arg;
struct ceph_msg *msg;
- msg = ceph_msg_new(0, pool->front_len, gfp_mask, true);
+ msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true);
if (!msg) {
dout("msgpool_alloc %s failed\n", pool->name);
} else {
@@ -32,10 +32,11 @@ static void msgpool_free(void *element, void *arg)
ceph_msg_put(msg);
}
-int ceph_msgpool_init(struct ceph_msgpool *pool,
+int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
int front_len, int size, bool blocking, const char *name)
{
dout("msgpool %s init\n", name);
+ pool->type = type;
pool->front_len = front_len;
pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
if (!pool->pool)
@@ -61,7 +62,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
WARN_ON(1);
/* try to alloc a fresh message */
- return ceph_msg_new(0, front_len, GFP_NOFS, false);
+ return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);
}
msg = mempool_alloc(pool->pool, GFP_NOFS);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ca59e66c9787..42119c05e82c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -140,10 +140,9 @@ void ceph_osdc_release_request(struct kref *kref)
if (req->r_request)
ceph_msg_put(req->r_request);
if (req->r_con_filling_msg) {
- dout("release_request revoking pages %p from con %p\n",
+ dout("%s revoking pages %p from con %p\n", __func__,
req->r_pages, req->r_con_filling_msg);
- ceph_con_revoke_message(req->r_con_filling_msg,
- req->r_reply);
+ ceph_msg_revoke_incoming(req->r_reply);
req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
}
if (req->r_reply)
@@ -214,10 +213,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
kref_init(&req->r_kref);
init_completion(&req->r_completion);
init_completion(&req->r_safe_completion);
+ rb_init_node(&req->r_node);
INIT_LIST_HEAD(&req->r_unsafe_item);
INIT_LIST_HEAD(&req->r_linger_item);
INIT_LIST_HEAD(&req->r_linger_osd);
INIT_LIST_HEAD(&req->r_req_lru_item);
+ INIT_LIST_HEAD(&req->r_osd_item);
+
req->r_flags = flags;
WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
@@ -243,6 +245,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
}
ceph_pagelist_init(req->r_trail);
}
+
/* create request message; allow space for oid */
msg_size += MAX_OBJ_NAME_SIZE;
if (snapc)
@@ -256,7 +259,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
return NULL;
}
- msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
memset(msg->front.iov_base, 0, msg->front.iov_len);
req->r_request = msg;
@@ -624,7 +626,7 @@ static void osd_reset(struct ceph_connection *con)
/*
* Track open sessions with osds.
*/
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
{
struct ceph_osd *osd;
@@ -634,15 +636,13 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
atomic_set(&osd->o_ref, 1);
osd->o_osdc = osdc;
+ osd->o_osd = onum;
INIT_LIST_HEAD(&osd->o_requests);
INIT_LIST_HEAD(&osd->o_linger_requests);
INIT_LIST_HEAD(&osd->o_osd_lru);
osd->o_incarnation = 1;
- ceph_con_init(osdc->client->msgr, &osd->o_con);
- osd->o_con.private = osd;
- osd->o_con.ops = &osd_con_ops;
- osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+ ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
INIT_LIST_HEAD(&osd->o_keepalive_item);
return osd;
@@ -688,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
static void remove_all_osds(struct ceph_osd_client *osdc)
{
- dout("__remove_old_osds %p\n", osdc);
+ dout("%s %p\n", __func__, osdc);
mutex_lock(&osdc->request_mutex);
while (!RB_EMPTY_ROOT(&osdc->osds)) {
struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
@@ -752,7 +752,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
ret = -EAGAIN;
} else {
ceph_con_close(&osd->o_con);
- ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+ ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+ &osdc->osdmap->osd_addr[osd->o_osd]);
osd->o_incarnation++;
}
return ret;
@@ -853,7 +854,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
if (req->r_osd) {
/* make sure the original request isn't in flight. */
- ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+ ceph_msg_revoke(req->r_request);
list_del_init(&req->r_osd_item);
if (list_empty(&req->r_osd->o_requests) &&
@@ -880,7 +881,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
static void __cancel_request(struct ceph_osd_request *req)
{
if (req->r_sent && req->r_osd) {
- ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+ ceph_msg_revoke(req->r_request);
req->r_sent = 0;
}
}
@@ -890,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
{
dout("__register_linger_request %p\n", req);
list_add_tail(&req->r_linger_item, &osdc->req_linger);
- list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
+ if (req->r_osd)
+ list_add_tail(&req->r_linger_osd,
+ &req->r_osd->o_linger_requests);
}
static void __unregister_linger_request(struct ceph_osd_client *osdc,
@@ -998,18 +1001,18 @@ static int __map_request(struct ceph_osd_client *osdc,
req->r_osd = __lookup_osd(osdc, o);
if (!req->r_osd && o >= 0) {
err = -ENOMEM;
- req->r_osd = create_osd(osdc);
+ req->r_osd = create_osd(osdc, o);
if (!req->r_osd) {
list_move(&req->r_req_lru_item, &osdc->req_notarget);
goto out;
}
dout("map_request osd %p is osd%d\n", req->r_osd, o);
- req->r_osd->o_osd = o;
- req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
__insert_osd(osdc, req->r_osd);
- ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+ ceph_con_open(&req->r_osd->o_con,
+ CEPH_ENTITY_TYPE_OSD, o,
+ &osdc->osdmap->osd_addr[o]);
}
if (req->r_osd) {
@@ -1304,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ for (p = rb_first(&osdc->requests); p; ) {
req = rb_entry(p, struct ceph_osd_request, r_node);
+ p = rb_next(p);
err = __map_request(osdc, req, force_resend);
if (err < 0)
continue; /* error */
@@ -1313,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
dout("%p tid %llu maps to no osd\n", req, req->r_tid);
needmap++; /* request a newer map */
} else if (err > 0) {
- dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- if (!req->r_linger)
+ if (!req->r_linger) {
+ dout("%p tid %llu requeued on osd%d\n", req,
+ req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ }
+ }
+ if (req->r_linger && list_empty(&req->r_linger_item)) {
+ /*
+ * register as a linger so that we will
+ * re-submit below and get a new tid
+ */
+ dout("%p tid %llu restart on osd%d\n",
+ req, req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
+ __register_linger_request(osdc, req);
+ __unregister_request(osdc, req);
}
}
@@ -1391,7 +1408,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
epoch, maplen);
newmap = osdmap_apply_incremental(&p, next,
osdc->osdmap,
- osdc->client->msgr);
+ &osdc->client->msgr);
if (IS_ERR(newmap)) {
err = PTR_ERR(newmap);
goto bad;
@@ -1839,11 +1856,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
if (!osdc->req_mempool)
goto out;
- err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+ err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
+ OSD_OP_FRONT_LEN, 10, true,
"osd_op");
if (err < 0)
goto out_mempool;
- err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+ err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
OSD_OPREPLY_FRONT_LEN, 10, true,
"osd_op_reply");
if (err < 0)
@@ -2019,15 +2037,15 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
if (!req) {
*skip = 1;
m = NULL;
- pr_info("get_reply unknown tid %llu from osd%d\n", tid,
- osd->o_osd);
+ dout("get_reply unknown tid %llu from osd%d\n", tid,
+ osd->o_osd);
goto out;
}
if (req->r_con_filling_msg) {
- dout("get_reply revoking msg %p from old con %p\n",
+ dout("%s revoking msg %p from old con %p\n", __func__,
req->r_reply, req->r_con_filling_msg);
- ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+ ceph_msg_revoke_incoming(req->r_reply);
req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
req->r_con_filling_msg = NULL;
}
@@ -2080,6 +2098,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
int type = le16_to_cpu(hdr->type);
int front = le32_to_cpu(hdr->front_len);
+ *skip = 0;
switch (type) {
case CEPH_MSG_OSD_MAP:
case CEPH_MSG_WATCH_NOTIFY:
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 81e3b84a77ef..3124b71a8883 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -135,6 +135,21 @@ bad:
return -EINVAL;
}
+static int skip_name_map(void **p, void *end)
+{
+ int len;
+ ceph_decode_32_safe(p, end, len ,bad);
+ while (len--) {
+ int strlen;
+ *p += sizeof(u32);
+ ceph_decode_32_safe(p, end, strlen, bad);
+ *p += strlen;
+}
+ return 0;
+bad:
+ return -EINVAL;
+}
+
static struct crush_map *crush_decode(void *pbyval, void *end)
{
struct crush_map *c;
@@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
void **p = &pbyval;
void *start = pbyval;
u32 magic;
+ u32 num_name_maps;
dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
@@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
if (c == NULL)
return ERR_PTR(-ENOMEM);
+ /* set tunables to default values */
+ c->choose_local_tries = 2;
+ c->choose_local_fallback_tries = 5;
+ c->choose_total_tries = 19;
+
ceph_decode_need(p, end, 4*sizeof(u32), bad);
magic = ceph_decode_32(p);
if (magic != CRUSH_MAGIC) {
@@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
}
/* ignore trailing name maps. */
+ for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
+ err = skip_name_map(p, end);
+ if (err < 0)
+ goto done;
+ }
+
+ /* tunables */
+ ceph_decode_need(p, end, 3*sizeof(u32), done);
+ c->choose_local_tries = ceph_decode_32(p);
+ c->choose_local_fallback_tries = ceph_decode_32(p);
+ c->choose_total_tries = ceph_decode_32(p);
+ dout("crush decode tunable choose_local_tries = %d",
+ c->choose_local_tries);
+ dout("crush decode tunable choose_local_fallback_tries = %d",
+ c->choose_local_fallback_tries);
+ dout("crush decode tunable choose_total_tries = %d",
+ c->choose_total_tries);
+done:
dout("crush_decode success\n");
return c;
@@ -488,15 +527,16 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
ceph_decode_32_safe(p, end, pool, bad);
ceph_decode_32_safe(p, end, len, bad);
dout(" pool %d len %d\n", pool, len);
+ ceph_decode_need(p, end, len, bad);
pi = __lookup_pg_pool(&map->pg_pools, pool);
if (pi) {
+ char *name = kstrndup(*p, len, GFP_NOFS);
+
+ if (!name)
+ return -ENOMEM;
kfree(pi->name);
- pi->name = kmalloc(len + 1, GFP_NOFS);
- if (pi->name) {
- memcpy(pi->name, *p, len);
- pi->name[len] = '\0';
- dout(" name is %s\n", pi->name);
- }
+ pi->name = name;
+ dout(" name is %s\n", pi->name);
}
*p += len;
}
@@ -666,6 +706,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
ceph_decode_copy(p, &pgid, sizeof(pgid));
n = ceph_decode_32(p);
+ err = -EINVAL;
+ if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+ goto bad;
ceph_decode_need(p, end, n * sizeof(u32), bad);
err = -ENOMEM;
pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
@@ -889,6 +932,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
(void) __remove_pg_mapping(&map->pg_temp, pgid);
/* insert */
+ if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) {
+ err = -EINVAL;
+ goto bad;
+ }
pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
if (!pg) {
err = -ENOMEM;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0ebaea16632f..f91abf800161 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1172,6 +1172,7 @@ static int __dev_open(struct net_device *dev)
net_dmaengine_get();
dev_set_rx_mode(dev);
dev_activate(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
}
return ret;
@@ -2133,6 +2134,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
__be16 protocol = skb->protocol;
netdev_features_t features = skb->dev->features;
+ if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+ features &= ~NETIF_F_GSO_MASK;
+
if (protocol == htons(ETH_P_8021Q)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
protocol = veh->h_vlan_encapsulated_proto;
@@ -3155,6 +3159,23 @@ void netdev_rx_handler_unregister(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+/*
+ * Limit the use of PFMEMALLOC reserves to those protocols that implement
+ * the special handling of PFMEMALLOC skbs.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_ARP):
+ case __constant_htons(ETH_P_IP):
+ case __constant_htons(ETH_P_IPV6):
+ case __constant_htons(ETH_P_8021Q):
+ return true;
+ default:
+ return false;
+ }
+}
+
static int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
@@ -3164,14 +3185,27 @@ static int __netif_receive_skb(struct sk_buff *skb)
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
+ unsigned long pflags = current->flags;
net_timestamp_check(!netdev_tstamp_prequeue, skb);
trace_netif_receive_skb(skb);
+ /*
+ * PFMEMALLOC skbs are special, they should
+ * - be delivered to SOCK_MEMALLOC sockets only
+ * - stay away from userspace
+ * - have bounded memory usage
+ *
+ * Use PF_MEMALLOC as this saves us from propagating the allocation
+ * context down to all allocation sites.
+ */
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb))
+ current->flags |= PF_MEMALLOC;
+
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
- return NET_RX_DROP;
+ goto out;
orig_dev = skb->dev;
@@ -3191,7 +3225,7 @@ another_round:
if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
skb = vlan_untag(skb);
if (unlikely(!skb))
- goto out;
+ goto unlock;
}
#ifdef CONFIG_NET_CLS_ACT
@@ -3201,6 +3235,9 @@ another_round:
}
#endif
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb))
+ goto skip_taps;
+
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
@@ -3209,13 +3246,18 @@ another_round:
}
}
+skip_taps:
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
- goto out;
+ goto unlock;
ncls:
#endif
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb)
+ && !skb_pfmemalloc_protocol(skb))
+ goto drop;
+
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (vlan_tx_tag_present(skb)) {
if (pt_prev) {
@@ -3225,7 +3267,7 @@ ncls:
if (vlan_do_receive(&skb, !rx_handler))
goto another_round;
else if (unlikely(!skb))
- goto out;
+ goto unlock;
}
if (rx_handler) {
@@ -3235,7 +3277,7 @@ ncls:
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
- goto out;
+ goto unlock;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
@@ -3268,6 +3310,7 @@ ncls:
else
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
+drop:
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
@@ -3276,8 +3319,10 @@ ncls:
ret = NET_RX_DROP;
}
-out:
+unlock:
rcu_read_unlock();
+out:
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
return ret;
}
@@ -4801,6 +4846,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
err = ops->ndo_set_mac_address(dev, sa);
if (!err)
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
return err;
}
EXPORT_SYMBOL(dev_set_mac_address);
@@ -5579,6 +5625,7 @@ int register_netdevice(struct net_device *dev)
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
@@ -5942,6 +5989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_MAX_SIZE;
+ dev->gso_max_segs = GSO_MAX_SEGS;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
diff --git a/net/core/filter.c b/net/core/filter.c
index d4ce2dc712e3..907efd27ec77 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
int err;
struct sk_filter *filter;
+ /*
+ * If the skb was allocated from pfmemalloc reserves, only
+ * allow SOCK_MEMALLOC sockets to use it as this socket is
+ * helping free memory
+ */
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+ return -ENOMEM;
+
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ff949dc954f..2c5a0a06c4ce 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1381,6 +1381,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
goto errout;
send_addr_notify = 1;
modified = 1;
+ add_device_randomness(dev->dev_addr, dev->addr_len);
}
if (tb[IFLA_MTU]) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 368f65c15e4f..fe00d1208167 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -145,6 +145,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
BUG();
}
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
+ bool *pfmemalloc)
+{
+ void *obj;
+ bool ret_pfmemalloc = false;
+
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
+}
+
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
* 'private' fields and also do memory statistics to find all the
* [BEEP] leaks.
@@ -155,8 +192,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
* __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
- * @fclone: allocate from fclone cache instead of head cache
- * and allocate a cloned (child) skb
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
* @node: numa node to allocate memory on
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
@@ -167,14 +206,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone, int node)
+ int flags, int node)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
+ bool pfmemalloc;
- cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
+
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
/* Get the HEAD */
skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -189,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
*/
size = SKB_DATA_ALIGN(size);
size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- data = kmalloc_node_track_caller(size, gfp_mask, node);
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
if (!data)
goto nodata;
/* kmalloc(size) might give us more room than requested.
@@ -207,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
memset(skb, 0, offsetof(struct sk_buff, tail));
/* Account for allocated memory : skb + skb->head */
skb->truesize = SKB_TRUESIZE(size);
+ skb->pfmemalloc = pfmemalloc;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -222,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(&shinfo->dataref, 1);
kmemcheck_annotate_variable(shinfo->destructor_arg);
- if (fclone) {
+ if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -232,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(fclone_ref, 1);
child->fclone = SKB_FCLONE_UNAVAILABLE;
+ child->pfmemalloc = pfmemalloc;
}
out:
return skb;
@@ -302,14 +348,7 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
-/**
- * netdev_alloc_frag - allocate a page fragment
- * @fragsz: fragment size
- *
- * Allocates a frag from a page for receive buffer.
- * Uses GFP_ATOMIC allocations.
- */
-void *netdev_alloc_frag(unsigned int fragsz)
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
struct netdev_alloc_cache *nc;
void *data = NULL;
@@ -319,7 +358,7 @@ void *netdev_alloc_frag(unsigned int fragsz)
nc = &__get_cpu_var(netdev_alloc_cache);
if (unlikely(!nc->page)) {
refill:
- nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+ nc->page = alloc_page(gfp_mask);
if (unlikely(!nc->page))
goto end;
recycle:
@@ -343,6 +382,18 @@ end:
local_irq_restore(flags);
return data;
}
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+void *netdev_alloc_frag(unsigned int fragsz)
+{
+ return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
EXPORT_SYMBOL(netdev_alloc_frag);
/**
@@ -366,7 +417,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
- void *data = netdev_alloc_frag(fragsz);
+ void *data;
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = __netdev_alloc_frag(fragsz, gfp_mask);
if (likely(data)) {
skb = build_skb(data, fragsz);
@@ -374,7 +430,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
put_page(virt_to_head_page(data));
}
} else {
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+ SKB_ALLOC_RX, NUMA_NO_NODE);
}
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
@@ -656,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#if IS_ENABLED(CONFIG_IP_VS)
new->ipvs_property = old->ipvs_property;
#endif
+ new->pfmemalloc = old->pfmemalloc;
new->protocol = old->protocol;
new->mark = old->mark;
new->skb_iif = old->skb_iif;
@@ -814,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->fclone = SKB_FCLONE_CLONE;
atomic_inc(fclone_ref);
} else {
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
@@ -850,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+ if (skb_pfmemalloc(skb))
+ return SKB_ALLOC_RX;
+ return 0;
+}
+
/**
* skb_copy - create private copy of an sk_buff
* @skb: buffer to copy
@@ -871,7 +939,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
int headerlen = skb_headroom(skb);
unsigned int size = skb_end_offset(skb) + skb->data_len;
- struct sk_buff *n = alloc_skb(size, gfp_mask);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -906,7 +975,8 @@ EXPORT_SYMBOL(skb_copy);
struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
{
unsigned int size = skb_headlen(skb) + headroom;
- struct sk_buff *n = alloc_skb(size, gfp_mask);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
goto out;
@@ -979,8 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
size = SKB_DATA_ALIGN(size);
- data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
- gfp_mask);
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
size = SKB_WITH_OVERHEAD(ksize(data));
@@ -1092,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask);
+ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
int off;
@@ -2775,8 +2848,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
- nskb = alloc_skb(hsize + doffset + headroom,
- GFP_ATOMIC);
+ nskb = __alloc_skb(hsize + doffset + headroom,
+ GFP_ATOMIC, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
if (unlikely(!nskb))
goto err;
diff --git a/net/core/sock.c b/net/core/sock.c
index 2676a88f533e..8f67ced8d6a8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -142,7 +142,7 @@
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
struct proto *proto;
@@ -271,6 +271,61 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
+struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(memalloc_socks);
+
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements
+ */
+void sk_set_memalloc(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation |= __GFP_MEMALLOC;
+ static_key_slow_inc(&memalloc_socks);
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+void sk_clear_memalloc(struct sock *sk)
+{
+ sock_reset_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation &= ~__GFP_MEMALLOC;
+ static_key_slow_dec(&memalloc_socks);
+
+ /*
+ * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+ * progress of swapping. However, if SOCK_MEMALLOC is cleared while
+ * it has rmem allocations there is a risk that the user of the
+ * socket cannot make forward progress due to exceeding the rmem
+ * limits. By rights, sk_clear_memalloc() should only be called
+ * on sockets being torn down but warn and reset the accounting if
+ * that assumption breaks.
+ */
+ if (WARN_ON(sk->sk_forward_alloc))
+ sk_mem_reclaim(sk);
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+ unsigned long pflags = current->flags;
+
+ /* these should have been dropped before queueing */
+ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+ current->flags |= PF_MEMALLOC;
+ ret = sk->sk_backlog_rcv(sk, skb);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+ return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
+
#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
@@ -353,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (err)
return err;
- if (!sk_rmem_schedule(sk, skb->truesize)) {
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
atomic_inc(&sk->sk_drops);
return -ENOBUFS;
}
@@ -1403,6 +1458,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = dst->dev->gso_max_size;
+ sk->sk_gso_max_segs = dst->dev->gso_max_segs;
}
}
}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ae2ccf2890e4..15ca63ec604e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o
+obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c035251beb07..e4ba974f143c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -70,7 +70,6 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -80,7 +79,6 @@
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
-#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
@@ -88,11 +86,9 @@
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
-#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
-#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4b6487a68279..1b5ce96707a3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
int ret;
unsigned long vec[3];
struct net *net = current->nsproxy->net_ns;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg;
#endif
@@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
if (ret)
return ret;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e7e6eeae49c0..2109ff4a1daf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -811,7 +811,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
old_size_goal + mss_now > xmit_size_goal)) {
xmit_size_goal = old_size_goal;
} else {
- tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
+ tp->xmit_size_goal_segs =
+ min_t(u16, xmit_size_goal / mss_now,
+ sk->sk_gso_max_segs);
xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
}
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4d4db16e336e..1432cdb0644c 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -291,7 +291,8 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
left = tp->snd_cwnd - in_flight;
if (sk_can_gso(sk) &&
left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
- left * tp->mss_cache < sk->sk_gso_max_size)
+ left * tp->mss_cache < sk->sk_gso_max_size &&
+ left < sk->sk_gso_max_segs)
return true;
return left <= tcp_max_tso_deferred_mss(tp);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9be30b039ae3..2fd2bc9e3c64 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
-static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_rmem_schedule(sk, size)) {
+ !sk_rmem_schedule(sk, skb, size)) {
if (tcp_prune_queue(sk) < 0)
return -1;
- if (!sk_rmem_schedule(sk, size)) {
+ if (!sk_rmem_schedule(sk, skb, size)) {
if (!tcp_prune_ofo_queue(sk))
return -1;
- if (!sk_rmem_schedule(sk, size))
+ if (!sk_rmem_schedule(sk, skb, size))
return -1;
}
}
@@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
TCP_ECN_check_ce(tp, skb);
- if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
+ if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
__kfree_skb(skb);
return;
@@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
struct tcphdr *th;
bool fragstolen;
- if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
- goto err;
-
skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
if (!skb)
goto err;
+ if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+ goto err_free;
+
th = (struct tcphdr *)skb_put(skb, sizeof(*th));
skb_reset_transport_header(skb);
memset(th, 0, sizeof(*th));
@@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb->truesize))
+ tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto drop;
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7f91e5ac8277..42b2a6a73092 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2633,7 +2633,7 @@ struct proto tcp_prot = {
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 33cd065cfbd8..a7b3ec9b6c3e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1522,21 +1522,21 @@ static void tcp_cwnd_validate(struct sock *sk)
* when we would be allowed to send the split-due-to-Nagle skb fully.
*/
static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
- unsigned int mss_now, unsigned int cwnd)
+ unsigned int mss_now, unsigned int max_segs)
{
const struct tcp_sock *tp = tcp_sk(sk);
- u32 needed, window, cwnd_len;
+ u32 needed, window, max_len;
window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
- cwnd_len = mss_now * cwnd;
+ max_len = mss_now * max_segs;
- if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
- return cwnd_len;
+ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
+ return max_len;
needed = min(skb->len, window);
- if (cwnd_len <= needed)
- return cwnd_len;
+ if (max_len <= needed)
+ return max_len;
return needed - needed % mss_now;
}
@@ -1765,7 +1765,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
limit = min(send_win, cong_win);
/* If a full-sized TSO skb can be sent, do it. */
- if (limit >= sk->sk_gso_max_size)
+ if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
+ sk->sk_gso_max_segs * tp->mss_cache))
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
@@ -1999,7 +2000,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
- cwnd_quota);
+ min_t(unsigned int,
+ cwnd_quota,
+ sk->sk_gso_max_segs));
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -2045,7 +2048,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
if (unlikely(sk->sk_state == TCP_CLOSE))
return;
- if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
+ if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+ sk_gfp_atomic(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
@@ -2666,7 +2670,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
s_data_desired = cvp->s_data_desired;
- skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
+ sk_gfp_atomic(sk, GFP_ATOMIC));
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -3064,7 +3069,7 @@ void tcp_send_ack(struct sock *sk)
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
- buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
if (buff == NULL) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -3079,7 +3084,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+ tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
}
/* This routine sends a packet with an out of date sequence
@@ -3099,7 +3104,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
if (skb == NULL)
return -1;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 221224e72507..c66b90f71c9b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1299,7 +1299,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
/* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
if (treq->pktopts != NULL) {
- newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC);
+ newnp->pktoptions = skb_clone(treq->pktopts,
+ sk_gfp_atomic(sk, GFP_ATOMIC));
consume_skb(treq->pktopts);
treq->pktopts = NULL;
if (newnp->pktoptions)
@@ -1349,7 +1350,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
* across. Shucks.
*/
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr,
- AF_INET6, key->key, key->keylen, GFP_ATOMIC);
+ AF_INET6, key->key, key->keylen,
+ sk_gfp_atomic(sk, GFP_ATOMIC));
}
#endif
@@ -1442,7 +1444,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
--ANK (980728)
*/
if (np->rxopt.all)
- opt_skb = skb_clone(skb, GFP_ATOMIC);
+ opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC));
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
sock_rps_save_rxhash(sk, skb);
@@ -2015,7 +2017,7 @@ struct proto tcpv6_prot = {
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
.proto_cgroup = tcp_proto_cgroup,
#endif
};
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 929f897a8ded..03fe6d1cff42 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1389,6 +1389,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
else
memset(next_hop, 0, ETH_ALEN);
+ memset(pinfo, 0, sizeof(*pinfo));
+
pinfo->generation = mesh_paths_generation;
pinfo->filled = MPATH_INFO_FRAME_QLEN |
@@ -1407,7 +1409,6 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
pinfo->discovery_timeout =
jiffies_to_msecs(mpath->discovery_timeout);
pinfo->discovery_retries = mpath->discovery_retries;
- pinfo->flags = 0;
if (mpath->flags & MESH_PATH_ACTIVE)
pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE;
if (mpath->flags & MESH_PATH_RESOLVING)
@@ -1416,10 +1417,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID;
if (mpath->flags & MESH_PATH_FIXED)
pinfo->flags |= NL80211_MPATH_FLAG_FIXED;
- if (mpath->flags & MESH_PATH_RESOLVING)
- pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING;
-
- pinfo->flags = mpath->flags;
+ if (mpath->flags & MESH_PATH_RESOLVED)
+ pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED;
}
static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index f4a636ffe023..ff0296c7bab8 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -637,6 +637,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
del_timer_sync(&sdata->u.mesh.housekeeping_timer);
del_timer_sync(&sdata->u.mesh.mesh_path_root_timer);
+ del_timer_sync(&sdata->u.mesh.mesh_path_timer);
/*
* If the timer fired while we waited for it, it will have
* requeued the work. Now the work will be running again
@@ -649,6 +650,8 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
local->fif_other_bss--;
atomic_dec(&local->iff_allmultis);
ieee80211_configure_filter(local);
+
+ sdata->u.mesh.timers_running = 0;
}
static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b65b2149b23b..5d77650d4363 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1467,6 +1467,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
del_timer_sync(&sdata->u.mgd.timer);
del_timer_sync(&sdata->u.mgd.chswitch_timer);
+
+ sdata->u.mgd.timers_running = 0;
}
void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
@@ -3298,6 +3300,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
goto out_unlock;
err_clear:
+ memset(ifmgd->bssid, 0, ETH_ALEN);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
ifmgd->auth_data = NULL;
err_free:
kfree(auth_data);
@@ -3506,6 +3510,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
err = 0;
goto out;
err_clear:
+ memset(ifmgd->bssid, 0, ETH_ALEN);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
ifmgd->assoc_data = NULL;
err_free:
kfree(assoc_data);
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index ef1d69306315..740e414d44f4 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -299,7 +299,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted,
if (local->scan_req != local->int_scan_req)
cfg80211_scan_done(local->scan_req, aborted);
local->scan_req = NULL;
- local->scan_sdata = NULL;
+ rcu_assign_pointer(local->scan_sdata, NULL);
local->scanning = 0;
local->scan_channel = NULL;
@@ -984,7 +984,6 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata)
kfree(local->sched_scan_ies.ie[i]);
drv_sched_scan_stop(local, sdata);
- rcu_assign_pointer(local->sched_scan_sdata, NULL);
}
out:
mutex_unlock(&local->mtx);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 3b807bcb8fc9..29eb4e678235 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1807,37 +1807,31 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr,
sdata, NULL, NULL);
} else {
- int is_mesh_mcast = 1;
- const u8 *mesh_da;
+ /* DS -> MBSS (802.11-2012 13.11.3.3).
+ * For unicast with unknown forwarding information,
+ * destination might be in the MBSS or if that fails
+ * forwarded to another mesh gate. In either case
+ * resolution will be handled in ieee80211_xmit(), so
+ * leave the original DA. This also works for mcast */
+ const u8 *mesh_da = skb->data;
+
+ if (mppath)
+ mesh_da = mppath->mpp;
+ else if (mpath)
+ mesh_da = mpath->dst;
+ rcu_read_unlock();
- if (is_multicast_ether_addr(skb->data))
- /* DA TA mSA AE:SA */
- mesh_da = skb->data;
- else {
- static const u8 bcast[ETH_ALEN] =
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
- if (mppath) {
- /* RA TA mDA mSA AE:DA SA */
- mesh_da = mppath->mpp;
- is_mesh_mcast = 0;
- } else if (mpath) {
- mesh_da = mpath->dst;
- is_mesh_mcast = 0;
- } else {
- /* DA TA mSA AE:SA */
- mesh_da = bcast;
- }
- }
hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc,
mesh_da, sdata->vif.addr);
- rcu_read_unlock();
- if (is_mesh_mcast)
+ if (is_multicast_ether_addr(mesh_da))
+ /* DA TA mSA AE:SA */
meshhdrlen =
ieee80211_new_mesh_header(&mesh_hdr,
sdata,
skb->data + ETH_ALEN,
NULL);
else
+ /* RA TA mDA mSA AE:DA SA */
meshhdrlen =
ieee80211_new_mesh_header(&mesh_hdr,
sdata,
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 33d894776192..10c018a5b9fe 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
if (rx_count >= asoc->base.sk->sk_rcvbuf) {
if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
- (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize)))
+ (!sk_rmem_schedule(asoc->base.sk, chunk->skb,
+ chunk->skb->truesize)))
goto fail;
}
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 9fe8857d8d59..03d03e37a7d5 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -21,6 +21,11 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
+config SUNRPC_SWAP
+ bool
+ depends on SUNRPC
+ select NETVM
+
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism"
depends on SUNRPC && CRYPTO
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 727e506cacda..b5c067bccc45 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -13,6 +13,7 @@
#include <linux/errno.h>
#include <linux/hash.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/gss_api.h>
#include <linux/spinlock.h>
#ifdef RPC_DEBUG
@@ -122,6 +123,59 @@ rpcauth_unregister(const struct rpc_authops *ops)
}
EXPORT_SYMBOL_GPL(rpcauth_unregister);
+/**
+ * rpcauth_list_flavors - discover registered flavors and pseudoflavors
+ * @array: array to fill in
+ * @size: size of "array"
+ *
+ * Returns the number of array items filled in, or a negative errno.
+ *
+ * The returned array is not sorted by any policy. Callers should not
+ * rely on the order of the items in the returned array.
+ */
+int
+rpcauth_list_flavors(rpc_authflavor_t *array, int size)
+{
+ rpc_authflavor_t flavor;
+ int result = 0;
+
+ spin_lock(&rpc_authflavor_lock);
+ for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) {
+ const struct rpc_authops *ops = auth_flavors[flavor];
+ rpc_authflavor_t pseudos[4];
+ int i, len;
+
+ if (result >= size) {
+ result = -ENOMEM;
+ break;
+ }
+
+ if (ops == NULL)
+ continue;
+ if (ops->list_pseudoflavors == NULL) {
+ array[result++] = ops->au_flavor;
+ continue;
+ }
+ len = ops->list_pseudoflavors(pseudos, ARRAY_SIZE(pseudos));
+ if (len < 0) {
+ result = len;
+ break;
+ }
+ for (i = 0; i < len; i++) {
+ if (result >= size) {
+ result = -ENOMEM;
+ break;
+ }
+ array[result++] = pseudos[i];
+ }
+ }
+ spin_unlock(&rpc_authflavor_lock);
+
+ dprintk("RPC: %s returns %d\n", __func__, result);
+ return result;
+}
+EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
+
struct rpc_auth *
rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
{
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index d3ad81f8da5b..34c522021004 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1619,6 +1619,7 @@ static const struct rpc_authops authgss_ops = {
.crcreate = gss_create_cred,
.pipes_create = gss_pipes_dentries_create,
.pipes_destroy = gss_pipes_dentries_destroy,
+ .list_pseudoflavors = gss_mech_list_pseudoflavors,
};
static const struct rpc_credops gss_credops = {
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 782bfe1b6465..b174fcd9ff4c 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -239,14 +239,28 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor);
-int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr)
+/**
+ * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors
+ * @array: array to fill in
+ * @size: size of "array"
+ *
+ * Returns the number of array items filled in, or a negative errno.
+ *
+ * The returned array is not sorted by any policy. Callers should not
+ * rely on the order of the items in the returned array.
+ */
+int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
{
struct gss_api_mech *pos = NULL;
int j, i = 0;
spin_lock(&registered_mechs_lock);
list_for_each_entry(pos, &registered_mechs, gm_list) {
- for (j=0; j < pos->gm_pf_num; j++) {
+ for (j = 0; j < pos->gm_pf_num; j++) {
+ if (i >= size) {
+ spin_unlock(&registered_mechs_lock);
+ return -ENOMEM;
+ }
array_ptr[i++] = pos->gm_pfs[j].pseudoflavor;
}
}
@@ -254,8 +268,6 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr)
return i;
}
-EXPORT_SYMBOL_GPL(gss_mech_list_pseudoflavors);
-
u32
gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service)
{
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 47ad2666fdf6..2afd2a84dc35 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1349,8 +1349,11 @@ static int c_show(struct seq_file *m, void *p)
if (cache_check(cd, cp, NULL))
/* cache_check does a cache_put on failure */
seq_printf(m, "# ");
- else
+ else {
+ if (cache_is_expired(cd, cp))
+ seq_printf(m, "# ");
cache_put(cp, cd);
+ }
return cd->cache_show(m, cd, cp);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 00eb859b7de5..fa48c60aef23 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -717,6 +717,15 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
atomic_inc(&clnt->cl_count);
if (clnt->cl_softrtry)
task->tk_flags |= RPC_TASK_SOFT;
+ if (sk_memalloc_socks()) {
+ struct rpc_xprt *xprt;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ if (xprt->swapper)
+ task->tk_flags |= RPC_TASK_SWAPPER;
+ rcu_read_unlock();
+ }
/* Add to the client's list of all tasks */
spin_lock(&clnt->cl_lock);
list_add_tail(&task->tk_task, &clnt->cl_tasks);
@@ -1844,12 +1853,13 @@ call_timeout(struct rpc_task *task)
return;
}
if (RPC_IS_SOFT(task)) {
- if (clnt->cl_chatty)
+ if (clnt->cl_chatty) {
rcu_read_lock();
printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
clnt->cl_protname,
rcu_dereference(clnt->cl_xprt)->servername);
rcu_read_unlock();
+ }
if (task->tk_flags & RPC_TASK_TIMEOUT)
rpc_exit(task, -ETIMEDOUT);
else
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 92509ffe15fc..a70acae496e4 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -251,7 +251,7 @@ static int rpcb_create_local_unix(struct net *net)
if (IS_ERR(clnt)) {
dprintk("RPC: failed to create AF_LOCAL rpcbind "
"client (errno %ld).\n", PTR_ERR(clnt));
- result = -PTR_ERR(clnt);
+ result = PTR_ERR(clnt);
goto out;
}
@@ -298,7 +298,7 @@ static int rpcb_create_local_net(struct net *net)
if (IS_ERR(clnt)) {
dprintk("RPC: failed to create local rpcbind "
"client (errno %ld).\n", PTR_ERR(clnt));
- result = -PTR_ERR(clnt);
+ result = PTR_ERR(clnt);
goto out;
}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 994cfea2bad6..128494ec9a64 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -300,8 +300,9 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
/*
* Make an RPC task runnable.
*
- * Note: If the task is ASYNC, this must be called with
- * the spinlock held to protect the wait queue operation.
+ * Note: If the task is ASYNC, and is being made runnable after sitting on an
+ * rpc_wait_queue, this must be called with the queue spinlock held to protect
+ * the wait queue operation.
*/
static void rpc_make_runnable(struct rpc_task *task)
{
@@ -790,7 +791,9 @@ void rpc_execute(struct rpc_task *task)
static void rpc_async_schedule(struct work_struct *work)
{
+ current->flags |= PF_FSTRANS;
__rpc_execute(container_of(work, struct rpc_task, u.tk_work));
+ current->flags &= ~PF_FSTRANS;
}
/**
@@ -812,7 +815,10 @@ static void rpc_async_schedule(struct work_struct *work)
void *rpc_malloc(struct rpc_task *task, size_t size)
{
struct rpc_buffer *buf;
- gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT;
+ gfp_t gfp = GFP_NOWAIT;
+
+ if (RPC_IS_SWAPPER(task))
+ gfp |= __GFP_MEMALLOC;
size += sizeof(struct rpc_buffer);
if (size <= RPC_BUFFER_MAXSIZE)
@@ -886,7 +892,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
static struct rpc_task *
rpc_alloc_task(void)
{
- return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
+ return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
}
/*
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 0cf165580d8d..0afba1b4b656 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -129,34 +129,6 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len)
EXPORT_SYMBOL_GPL(xdr_terminate_string);
void
-xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base,
- unsigned int len)
-{
- struct kvec *tail = xdr->tail;
- u32 *p;
-
- xdr->pages = pages;
- xdr->page_base = base;
- xdr->page_len = len;
-
- p = (u32 *)xdr->head[0].iov_base + XDR_QUADLEN(xdr->head[0].iov_len);
- tail->iov_base = p;
- tail->iov_len = 0;
-
- if (len & 3) {
- unsigned int pad = 4 - (len & 3);
-
- *p = 0;
- tail->iov_base = (char *)p + (len & 3);
- tail->iov_len = pad;
- len += pad;
- }
- xdr->buflen += len;
- xdr->len += len;
-}
-EXPORT_SYMBOL_GPL(xdr_encode_pages);
-
-void
xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
struct page **pages, unsigned int base, unsigned int len)
{
@@ -457,6 +429,16 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len)
EXPORT_SYMBOL_GPL(xdr_shift_buf);
/**
+ * xdr_stream_pos - Return the current offset from the start of the xdr_stream
+ * @xdr: pointer to struct xdr_stream
+ */
+unsigned int xdr_stream_pos(const struct xdr_stream *xdr)
+{
+ return (unsigned int)(XDR_QUADLEN(xdr->buf->len) - xdr->nwords) << 2;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_pos);
+
+/**
* xdr_init_encode - Initialize a struct xdr_stream for sending data.
* @xdr: pointer to xdr_stream struct
* @buf: pointer to XDR buffer in which to encode data
@@ -556,13 +538,11 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b
EXPORT_SYMBOL_GPL(xdr_write_pages);
static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
- __be32 *p, unsigned int len)
+ unsigned int len)
{
if (len > iov->iov_len)
len = iov->iov_len;
- if (p == NULL)
- p = (__be32*)iov->iov_base;
- xdr->p = p;
+ xdr->p = (__be32*)iov->iov_base;
xdr->end = (__be32*)(iov->iov_base + len);
xdr->iov = iov;
xdr->page_ptr = NULL;
@@ -609,7 +589,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
newbase -= xdr->buf->page_base;
if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
}
static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -618,7 +598,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr)
xdr_set_next_page(xdr);
else if (xdr->iov == xdr->buf->head) {
if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
}
return xdr->p != xdr->end;
}
@@ -634,10 +614,15 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
xdr->buf = buf;
xdr->scratch.iov_base = NULL;
xdr->scratch.iov_len = 0;
+ xdr->nwords = XDR_QUADLEN(buf->len);
if (buf->head[0].iov_len != 0)
- xdr_set_iov(xdr, buf->head, p, buf->len);
+ xdr_set_iov(xdr, buf->head, buf->len);
else if (buf->page_len != 0)
xdr_set_page_base(xdr, 0, buf->len);
+ if (p != NULL && p > xdr->p && xdr->end >= p) {
+ xdr->nwords -= p - xdr->p;
+ xdr->p = p;
+ }
}
EXPORT_SYMBOL_GPL(xdr_init_decode);
@@ -662,12 +647,14 @@ EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
{
+ unsigned int nwords = XDR_QUADLEN(nbytes);
__be32 *p = xdr->p;
- __be32 *q = p + XDR_QUADLEN(nbytes);
+ __be32 *q = p + nwords;
- if (unlikely(q > xdr->end || q < p))
+ if (unlikely(nwords > xdr->nwords || q > xdr->end || q < p))
return NULL;
xdr->p = q;
+ xdr->nwords -= nwords;
return p;
}
@@ -734,6 +721,31 @@ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
}
EXPORT_SYMBOL_GPL(xdr_inline_decode);
+static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ struct kvec *iov;
+ unsigned int nwords = XDR_QUADLEN(len);
+ unsigned int cur = xdr_stream_pos(xdr);
+
+ if (xdr->nwords == 0)
+ return 0;
+ if (nwords > xdr->nwords) {
+ nwords = xdr->nwords;
+ len = nwords << 2;
+ }
+ /* Realign pages to current pointer position */
+ iov = buf->head;
+ if (iov->iov_len > cur)
+ xdr_shrink_bufhead(buf, iov->iov_len - cur);
+
+ /* Truncate page data and move it into the tail */
+ if (buf->page_len > len)
+ xdr_shrink_pagelen(buf, buf->page_len - len);
+ xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ return len;
+}
+
/**
* xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position
* @xdr: pointer to xdr_stream struct
@@ -742,39 +754,37 @@ EXPORT_SYMBOL_GPL(xdr_inline_decode);
* Moves data beyond the current pointer position from the XDR head[] buffer
* into the page list. Any data that lies beyond current position + "len"
* bytes is moved into the XDR tail[].
+ *
+ * Returns the number of XDR encoded bytes now contained in the pages
*/
-void xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
+unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
{
struct xdr_buf *buf = xdr->buf;
struct kvec *iov;
- ssize_t shift;
+ unsigned int nwords;
unsigned int end;
- int padding;
+ unsigned int padding;
- /* Realign pages to current pointer position */
- iov = buf->head;
- shift = iov->iov_len + (char *)iov->iov_base - (char *)xdr->p;
- if (shift > 0)
- xdr_shrink_bufhead(buf, shift);
-
- /* Truncate page data and move it into the tail */
- if (buf->page_len > len)
- xdr_shrink_pagelen(buf, buf->page_len - len);
- padding = (XDR_QUADLEN(len) << 2) - len;
+ len = xdr_align_pages(xdr, len);
+ if (len == 0)
+ return 0;
+ nwords = XDR_QUADLEN(len);
+ padding = (nwords << 2) - len;
xdr->iov = iov = buf->tail;
/* Compute remaining message length. */
- end = iov->iov_len;
- shift = buf->buflen - buf->len;
- if (shift < end)
- end -= shift;
- else if (shift > 0)
- end = 0;
+ end = ((xdr->nwords - nwords) << 2) + padding;
+ if (end > iov->iov_len)
+ end = iov->iov_len;
+
/*
* Position current pointer at beginning of tail, and
* set remaining message length.
*/
xdr->p = (__be32 *)((char *)iov->iov_base + padding);
xdr->end = (__be32 *)((char *)iov->iov_base + end);
+ xdr->page_ptr = NULL;
+ xdr->nwords = XDR_QUADLEN(end - padding);
+ return len;
}
EXPORT_SYMBOL_GPL(xdr_read_pages);
@@ -790,12 +800,13 @@ EXPORT_SYMBOL_GPL(xdr_read_pages);
*/
void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
{
- xdr_read_pages(xdr, len);
+ len = xdr_align_pages(xdr, len);
/*
* Position current pointer at beginning of tail, and
* set remaining message length.
*/
- xdr_set_page_base(xdr, 0, len);
+ if (len != 0)
+ xdr_set_page_base(xdr, 0, len);
}
EXPORT_SYMBOL_GPL(xdr_enter_page);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b446e100286f..06cdbff79e4a 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -200,6 +200,7 @@ xprt_rdma_connect_worker(struct work_struct *work)
int rc = 0;
if (!xprt->shutdown) {
+ current->flags |= PF_FSTRANS;
xprt_clear_connected(xprt);
dprintk("RPC: %s: %sconnect\n", __func__,
@@ -212,10 +213,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
out:
xprt_wake_pending_tasks(xprt, rc);
-
out_clear:
dprintk("RPC: %s: exit\n", __func__);
xprt_clear_connecting(xprt);
+ current->flags &= ~PF_FSTRANS;
}
/*
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 62d0dac8f780..400567243f84 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1892,6 +1892,8 @@ static void xs_local_setup_socket(struct work_struct *work)
if (xprt->shutdown)
goto out;
+ current->flags |= PF_FSTRANS;
+
clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
status = __sock_create(xprt->xprt_net, AF_LOCAL,
SOCK_STREAM, 0, &sock, 1);
@@ -1925,7 +1927,47 @@ static void xs_local_setup_socket(struct work_struct *work)
out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
+ current->flags &= ~PF_FSTRANS;
+}
+
+#ifdef CONFIG_SUNRPC_SWAP
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+ xprt);
+
+ if (xprt->swapper)
+ sk_set_memalloc(transport->inet);
+}
+
+/**
+ * xs_swapper - Tag this transport as being used for swap.
+ * @xprt: transport to tag
+ * @enable: enable/disable
+ *
+ */
+int xs_swapper(struct rpc_xprt *xprt, int enable)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+ xprt);
+ int err = 0;
+
+ if (enable) {
+ xprt->swapper++;
+ xs_set_memalloc(xprt);
+ } else if (xprt->swapper) {
+ xprt->swapper--;
+ sk_clear_memalloc(transport->inet);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xs_swapper);
+#else
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
}
+#endif
static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
{
@@ -1951,6 +1993,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
transport->sock = sock;
transport->inet = sk;
+ xs_set_memalloc(xprt);
+
write_unlock_bh(&sk->sk_callback_lock);
}
xs_udp_do_set_buffer_size(xprt);
@@ -1967,6 +2011,8 @@ static void xs_udp_setup_socket(struct work_struct *work)
if (xprt->shutdown)
goto out;
+ current->flags |= PF_FSTRANS;
+
/* Start by resetting any existing state */
xs_reset_transport(transport);
sock = xs_create_sock(xprt, transport,
@@ -1985,6 +2031,7 @@ static void xs_udp_setup_socket(struct work_struct *work)
out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
+ current->flags &= ~PF_FSTRANS;
}
/*
@@ -2075,6 +2122,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
if (!xprt_bound(xprt))
goto out;
+ xs_set_memalloc(xprt);
+
/* Tell the socket layer to start connecting... */
xprt->stat.connect_count++;
xprt->stat.connect_start = jiffies;
@@ -2110,6 +2159,8 @@ static void xs_tcp_setup_socket(struct work_struct *work)
if (xprt->shutdown)
goto out;
+ current->flags |= PF_FSTRANS;
+
if (!sock) {
clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
sock = xs_create_sock(xprt, transport,
@@ -2159,6 +2210,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -EINPROGRESS:
case -EALREADY:
xprt_clear_connecting(xprt);
+ current->flags &= ~PF_FSTRANS;
return;
case -EINVAL:
/* Happens, for instance, if the user specified a link
@@ -2171,6 +2223,7 @@ out_eagain:
out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
+ current->flags &= ~PF_FSTRANS;
}
/**
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 79981d97bc9c..e4768c180da2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -823,6 +823,34 @@ fail:
return NULL;
}
+static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+{
+ struct dentry *dentry;
+ struct path path;
+ int err = 0;
+ /*
+ * Get the parent directory, calculate the hash for last
+ * component.
+ */
+ dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ return err;
+
+ /*
+ * All right, let's create it.
+ */
+ err = security_path_mknod(&path, dentry, mode, 0);
+ if (!err) {
+ err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
+ if (!err) {
+ res->mnt = mntget(path.mnt);
+ res->dentry = dget(dentry);
+ }
+ }
+ done_path_create(&path, dentry);
+ return err;
+}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
@@ -831,8 +859,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct unix_sock *u = unix_sk(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
char *sun_path = sunaddr->sun_path;
- struct dentry *dentry = NULL;
- struct path path;
int err;
unsigned int hash;
struct unix_address *addr;
@@ -869,43 +895,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
atomic_set(&addr->refcnt, 1);
if (sun_path[0]) {
- umode_t mode;
- err = 0;
- /*
- * Get the parent directory, calculate the hash for last
- * component.
- */
- dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- goto out_mknod_parent;
-
- /*
- * All right, let's create it.
- */
- mode = S_IFSOCK |
+ struct path path;
+ umode_t mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current_umask());
- err = mnt_want_write(path.mnt);
- if (err)
- goto out_mknod_dput;
- err = security_path_mknod(&path, dentry, mode, 0);
- if (err)
- goto out_mknod_drop_write;
- err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
-out_mknod_drop_write:
- mnt_drop_write(path.mnt);
- if (err)
- goto out_mknod_dput;
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- dput(path.dentry);
- path.dentry = dentry;
-
+ err = unix_mknod(sun_path, mode, &path);
+ if (err) {
+ if (err == -EEXIST)
+ err = -EADDRINUSE;
+ unix_release_addr(addr);
+ goto out_up;
+ }
addr->hash = UNIX_HASH_SIZE;
- }
-
- spin_lock(&unix_table_lock);
-
- if (!sun_path[0]) {
+ hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
+ spin_lock(&unix_table_lock);
+ u->path = path;
+ list = &unix_socket_table[hash];
+ } else {
+ spin_lock(&unix_table_lock);
err = -EADDRINUSE;
if (__unix_find_socket_byname(net, sunaddr, addr_len,
sk->sk_type, hash)) {
@@ -914,9 +920,6 @@ out_mknod_drop_write:
}
list = &unix_socket_table[addr->hash];
- } else {
- list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
- u->path = path;
}
err = 0;
@@ -930,16 +933,6 @@ out_up:
mutex_unlock(&u->readlock);
out:
return err;
-
-out_mknod_dput:
- dput(dentry);
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- path_put(&path);
-out_mknod_parent:
- if (err == -EEXIST)
- err = -EADDRINUSE;
- unix_release_addr(addr);
- goto out_up;
}
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 91b300443f4b..443d4d7deea2 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1001,6 +1001,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
*/
synchronize_rcu();
INIT_LIST_HEAD(&wdev->list);
+ /*
+ * Ensure that all events have been processed and
+ * freed.
+ */
+ cfg80211_process_wdev_events(wdev);
break;
case NETDEV_PRE_UP:
if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype)))
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 5206c6844fd7..bc7430b54771 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -426,6 +426,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
struct net_device *dev, enum nl80211_iftype ntype,
u32 *flags, struct vif_params *params);
void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
+void cfg80211_process_wdev_events(struct wireless_dev *wdev);
int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 787aeaa902fe..222189b6ed53 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5656,8 +5656,10 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
sizeof(connect.ht_capa_mask));
if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
- if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
+ if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) {
+ kfree(connkeys);
return -EINVAL;
+ }
memcpy(&connect.ht_capa,
nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]),
sizeof(connect.ht_capa));
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index c6e0d467f8bd..1ad04e54014c 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -680,6 +680,8 @@ static u32 map_regdom_flags(u32 rd_flags)
channel_flags |= IEEE80211_CHAN_NO_IBSS;
if (rd_flags & NL80211_RRF_DFS)
channel_flags |= IEEE80211_CHAN_RADAR;
+ if (rd_flags & NL80211_RRF_NO_OFDM)
+ channel_flags |= IEEE80211_CHAN_NO_OFDM;
return channel_flags;
}
@@ -901,7 +903,21 @@ static void handle_channel(struct wiphy *wiphy,
chan->max_antenna_gain = min(chan->orig_mag,
(int) MBI_TO_DBI(power_rule->max_antenna_gain));
chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp);
- chan->max_power = min(chan->max_power, chan->max_reg_power);
+ if (chan->orig_mpwr) {
+ /*
+ * Devices that have their own custom regulatory domain
+ * but also use WIPHY_FLAG_STRICT_REGULATORY will follow the
+ * passed country IE power settings.
+ */
+ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY &&
+ wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY)
+ chan->max_power = chan->max_reg_power;
+ else
+ chan->max_power = min(chan->orig_mpwr,
+ chan->max_reg_power);
+ } else
+ chan->max_power = chan->max_reg_power;
}
static void handle_band(struct wiphy *wiphy,
@@ -1885,6 +1901,7 @@ static void restore_custom_reg_settings(struct wiphy *wiphy)
chan->flags = chan->orig_flags;
chan->max_antenna_gain = chan->orig_mag;
chan->max_power = chan->orig_mpwr;
+ chan->beacon_found = false;
}
}
}
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d7b672262b5f..ef35f4ef2aa6 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -723,7 +723,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
wdev->connect_keys = NULL;
}
-static void cfg80211_process_wdev_events(struct wireless_dev *wdev)
+void cfg80211_process_wdev_events(struct wireless_dev *wdev)
{
struct cfg80211_event *ev;
unsigned long flags;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5b228f97d4b3..87cd0e4d4282 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -415,8 +415,17 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me)
if (x->lft.hard_add_expires_seconds) {
long tmo = x->lft.hard_add_expires_seconds +
x->curlft.add_time - now;
- if (tmo <= 0)
- goto expired;
+ if (tmo <= 0) {
+ if (x->xflags & XFRM_SOFT_EXPIRE) {
+ /* enter hard expire without soft expire first?!
+ * setting a new date could trigger this.
+ * workarbound: fix x->curflt.add_time by below:
+ */
+ x->curlft.add_time = now - x->saved_tmo - 1;
+ tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
+ } else
+ goto expired;
+ }
if (tmo < next)
next = tmo;
}
@@ -433,10 +442,14 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me)
if (x->lft.soft_add_expires_seconds) {
long tmo = x->lft.soft_add_expires_seconds +
x->curlft.add_time - now;
- if (tmo <= 0)
+ if (tmo <= 0) {
warn = 1;
- else if (tmo < next)
+ x->xflags &= ~XFRM_SOFT_EXPIRE;
+ } else if (tmo < next) {
next = tmo;
+ x->xflags |= XFRM_SOFT_EXPIRE;
+ x->saved_tmo = tmo;
+ }
}
if (x->lft.soft_use_expires_seconds) {
long tmo = x->lft.soft_use_expires_seconds +