From 1c95df85ca49640576de2f0a850925957b547b84 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 8 Dec 2012 19:43:21 +0000 Subject: inet_diag: fix oops for IPv4 AF_INET6 TCP SYN-RECV state Fix inet_diag to be aware of the fact that AF_INET6 TCP connections instantiated for IPv4 traffic and in the SYN-RECV state were actually created with inet_reqsk_alloc(), instead of inet6_reqsk_alloc(). This means that for such connections inet6_rsk(req) returns a pointer to a random spot in memory up to roughly 64KB beyond the end of the request_sock. With this bug, for a server using AF_INET6 TCP sockets and serving IPv4 traffic, an inet_diag user like `ss state SYN-RECV` would lead to inet_diag_fill_req() causing an oops or the export to user space of 16 bytes of kernel memory as a garbage IPv6 address, depending on where the garbage inet6_rsk(req) pointed. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 53 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 0c34bfabc11..16cfa42cfd9 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,6 +44,10 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */ + struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */ +#endif }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -596,6 +600,36 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } +/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses + * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6. + */ +static inline void inet_diag_req_addrs(const struct sock *sk, + const struct request_sock *req, + struct inet_diag_entry *entry) +{ + struct inet_request_sock *ireq = inet_rsk(req); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + if (req->rsk_ops->family == AF_INET6) { + entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32; + entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32; + } else if (req->rsk_ops->family == AF_INET) { + ipv6_addr_set_v4mapped(ireq->loc_addr, + &entry->saddr_storage); + ipv6_addr_set_v4mapped(ireq->rmt_addr, + &entry->daddr_storage); + entry->saddr = entry->saddr_storage.s6_addr32; + entry->daddr = entry->daddr_storage.s6_addr32; + } + } else +#endif + { + entry->saddr = &ireq->loc_addr; + entry->daddr = &ireq->rmt_addr; + } +} + static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, struct request_sock *req, struct user_namespace *user_ns, @@ -637,8 +671,10 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, r->idiag_inode = 0; #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = inet6_rsk(req)->loc_addr; - *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; + struct inet_diag_entry entry; + inet_diag_req_addrs(sk, req, &entry); + memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); + memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr)); } #endif @@ -691,18 +727,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, continue; if (bc) { - entry.saddr = -#if IS_ENABLED(CONFIG_IPV6) - (entry.family == AF_INET6) ? - inet6_rsk(req)->loc_addr.s6_addr32 : -#endif - &ireq->loc_addr; - entry.daddr = -#if IS_ENABLED(CONFIG_IPV6) - (entry.family == AF_INET6) ? - inet6_rsk(req)->rmt_addr.s6_addr32 : -#endif - &ireq->rmt_addr; + inet_diag_req_addrs(sk, req, &entry); entry.dport = ntohs(ireq->rmt_port); if (!inet_diag_bc_run(bc, &entry)) -- cgit v1.2.3 From 405c005949e47b6e91359159c24753519ded0c67 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 8 Dec 2012 19:43:22 +0000 Subject: inet_diag: validate byte code to prevent oops in inet_diag_bc_run() Add logic to validate INET_DIAG_BC_S_COND and INET_DIAG_BC_D_COND operations. Previously we did not validate the inet_diag_hostcond, address family, address length, and prefix length. So a malicious user could make the kernel read beyond the end of the bytecode array by claiming to have a whole inet_diag_hostcond when the bytecode was not long enough to contain a whole inet_diag_hostcond of the given address family. Or they could make the kernel read up to about 27 bytes beyond the end of a connection address by passing a prefix length that exceeded the length of addresses of the given family. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 16cfa42cfd9..529747d07a2 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -513,6 +513,44 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* Validate an inet_diag_hostcond. */ +static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + int addr_len; + struct inet_diag_hostcond *cond; + + /* Check hostcond space. */ + *min_len += sizeof(struct inet_diag_hostcond); + if (len < *min_len) + return false; + cond = (struct inet_diag_hostcond *)(op + 1); + + /* Check address family and address length. */ + switch (cond->family) { + case AF_UNSPEC: + addr_len = 0; + break; + case AF_INET: + addr_len = sizeof(struct in_addr); + break; + case AF_INET6: + addr_len = sizeof(struct in6_addr); + break; + default: + return false; + } + *min_len += addr_len; + if (len < *min_len) + return false; + + /* Check prefix length (in bits) vs address length (in bytes). */ + if (cond->prefix_len > 8 * addr_len) + return false; + + return true; +} + static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) { const void *bc = bytecode; @@ -520,18 +558,22 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) while (len > 0) { const struct inet_diag_bc_op *op = bc; + int min_len = sizeof(struct inet_diag_bc_op); //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { - case INET_DIAG_BC_AUTO: case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: + if (!valid_hostcond(bc, len, &min_len)) + return -EINVAL; + /* fall through */ + case INET_DIAG_BC_AUTO: case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: case INET_DIAG_BC_JMP: - if (op->no < 4 || op->no > len + 4 || op->no & 3) + if (op->no < min_len || op->no > len + 4 || op->no & 3) return -EINVAL; if (op->no < len && !valid_cc(bytecode, bytecode_len, len - op->no)) @@ -542,7 +584,7 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) default: return -EINVAL; } - if (op->yes < 4 || op->yes > len + 4 || op->yes & 3) + if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) return -EINVAL; bc += op->yes; len -= op->yes; -- cgit v1.2.3 From f67caec9068cee426ec23cf9005a1dee2ecad187 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 8 Dec 2012 19:43:23 +0000 Subject: inet_diag: avoid unsafe and nonsensical prefix matches in inet_diag_bc_run() Add logic to check the address family of the user-supplied conditional and the address family of the connection entry. We now do not do prefix matching of addresses from different address families (AF_INET vs AF_INET6), except for the previously existing support for having an IPv4 prefix match an IPv4-mapped IPv6 address (which this commit maintains as-is). This change is needed for two reasons: (1) The addresses are different lengths, so comparing a 128-bit IPv6 prefix match condition to a 32-bit IPv4 connection address can cause us to unwittingly walk off the end of the IPv4 address and read garbage or oops. (2) The IPv4 and IPv6 address spaces are semantically distinct, so a simple bit-wise comparison of the prefixes is not meaningful, and would lead to bogus results (except for the IPv4-mapped IPv6 case, which this commit maintains). Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 529747d07a2..95f1a458371 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -432,25 +432,31 @@ static int inet_diag_bc_run(const struct nlattr *_bc, break; } - if (cond->prefix_len == 0) - break; - if (op->code == INET_DIAG_BC_S_COND) addr = entry->saddr; else addr = entry->daddr; + if (cond->family != AF_UNSPEC && + cond->family != entry->family) { + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr + 3, + cond->addr, + cond->prefix_len)) + break; + } + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; if (bitstring_match(addr, cond->addr, cond->prefix_len)) break; - if (entry->family == AF_INET6 && - cond->family == AF_INET) { - if (addr[0] == 0 && addr[1] == 0 && - addr[2] == htonl(0xffff) && - bitstring_match(addr + 3, cond->addr, - cond->prefix_len)) - break; - } yes = 0; break; } -- cgit v1.2.3 From 5e1f54201cb481f40a04bc47e1bc8c093a189e23 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 9 Dec 2012 11:09:54 +0000 Subject: inet_diag: validate port comparison byte code to prevent unsafe reads Add logic to verify that a port comparison byte code operation actually has the second inet_diag_bc_op from which we read the port for such operations. Previously the code blindly referenced op[1] without first checking whether a second inet_diag_bc_op struct could fit there. So a malicious user could make the kernel read 4 bytes beyond the end of the bytecode array by claiming to have a whole port comparison byte code (2 inet_diag_bc_op structs) when in fact the bytecode was not long enough to hold both. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 95f1a458371..e23e16dc501 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -557,6 +557,17 @@ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, return true; } +/* Validate a port comparison operator. */ +static inline bool valid_port_comparison(const struct inet_diag_bc_op *op, + int len, int *min_len) +{ + /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ + *min_len += sizeof(struct inet_diag_bc_op); + if (len < *min_len) + return false; + return true; +} + static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) { const void *bc = bytecode; @@ -572,24 +583,30 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) case INET_DIAG_BC_D_COND: if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; - /* fall through */ - case INET_DIAG_BC_AUTO: + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: - case INET_DIAG_BC_JMP: - if (op->no < min_len || op->no > len + 4 || op->no & 3) - return -EINVAL; - if (op->no < len && - !valid_cc(bytecode, bytecode_len, len - op->no)) + if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; default: return -EINVAL; } + + if (op->code != INET_DIAG_BC_NOP) { + if (op->no < min_len || op->no > len + 4 || op->no & 3) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len - op->no)) + return -EINVAL; + } + if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) return -EINVAL; bc += op->yes; -- cgit v1.2.3 From 1bf3751ec90cc3174e01f0d701e8449ce163d113 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 9 Dec 2012 23:41:06 +0000 Subject: ipv4: ip_check_defrag must not modify skb before unsharing ip_check_defrag() might be called from af_packet within the RX path where shared SKBs are used, so it must not modify the input SKB before it has unshared it for defragmentation. Use skb_copy_bits() to get the IP header and only pull in everything later. The same is true for the other caller in macvlan as it is called from dev->rx_handler which can also get a shared SKB. Reported-by: Eric Leblond Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 448e6854682..8d5cc75dac8 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -707,28 +707,27 @@ EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) { - const struct iphdr *iph; + struct iphdr iph; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) + if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) return skb; - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) + if (iph.ihl < 5 || iph.version != 4) return skb; - if (!pskb_may_pull(skb, iph->ihl*4)) - return skb; - iph = ip_hdr(skb); - len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl * 4)) + + len = ntohs(iph.tot_len); + if (skb->len < len || len < (iph.ihl * 4)) return skb; - if (ip_is_fragment(ip_hdr(skb))) { + if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { + if (!pskb_may_pull(skb, iph.ihl*4)) + return skb; if (pskb_trim_rcsum(skb, len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); -- cgit v1.2.3