racket-dns-2012/network-query.rkt

342 lines
14 KiB
Racket

#lang racket/base
(require racket/set)
(require racket/match)
(require "api.rkt")
(require "codec.rkt")
(require "zonedb.rkt")
(require "../racket-matrix/os2.rkt")
(require "../racket-matrix/os2-udp.rkt")
(require "../racket-matrix/os2-timer.rkt")
(require "os2-dns.rkt")
(provide network-query
(struct-out network-reply))
;; DJB's rules for handling DNS responses. Some of these are handled
;; here (specifically, rules 2 through 5, in the action of
;; filter-dns-reply), some are handled in resolver.rkt (rule 1, in the
;; action of answer-from-zone), and some are handled in the
;; interaction between the resolver and the network-query modules
;; (rule 1 as well, the interplay between CNAME expansion and
;; recursion):
;; <blockquote>
;; When a cache receives a normal DNS response, it learns exactly one
;; of the following five pieces of information:
;;
;; 1. ``The query was not answered because the query name is an
;; alias. I need to change the query name and try again.'' This
;; applies if the answer section of the response contains a CNAME
;; record for the query name and CNAME does not match the query type.
;;
;; 2. ``The query name has no records answering the query, and is also
;; guaranteed to have no records of any other type.'' This applies if
;; the response code is NXDOMAIN and #1 doesn't apply. The amount of
;; time that this information can be cached depends on the contents of
;; the SOA record in the authority section of the response, if there
;; is one.
;;
;; 3. ``The query name has one or more records answering the query.''
;; This applies if the answer section of the response contains one or
;; more records under the query name matching the query type, and #1
;; doesn't apply, and #2 doesn't apply.
;;
;; 4. ``The query was not answered because the server does not have
;; the answer. I need to contact other servers.'' This applies if the
;; authority section of the response contains NS records, and the
;; authority section of the response does not contain SOA records, and
;; #1 doesn't apply, and #2 doesn't apply, and #3 doesn't apply. The
;; ``other servers'' are named in the NS records in the authority
;; section.
;;
;; 5. ``The query name has no records answering the query, but it may
;; have records of another type.'' This applies if #1 doesn't apply,
;; and #2 doesn't apply, and #3 doesn't apply, and #4 doesn't
;; apply. The amount of time that this information can be cached
;; depends on the contents of the SOA record in the authority section,
;; if there is one.
;;
;; This procedure requires an incredible amount of bug-prone parsing
;; for a very small amount of information. The underlying problem is
;; that DNS was designed to declare information in a human-oriented
;; format, rather than to support crucial operations in the simplest
;; possible way.
;; </blockquote>
;;---------------------------------------------------------------------------
;; DJB's djbdns logic for determining whether a response is a lame
;; referral or not is as follows (see his query.c in areas dealing
;; with the variable "flagreferral" and calls to the function
;; "log_lame"):
;;
;; If a response -
;;
;; 1. has response-code no-error (0), and
;; 2. has no CNAME records in the answer section for the domain we're
;; interested in, and
;; 3. has no records in the answer section for the domain and type
;; we're interested in, and
;; 4. has no SOA records in the authority section, and
;; 5. has at least one NS record in the authority section, and
;; 6. that NS record's name is equal to our bailiwick or is not in our
;; bailiwick,
;;
;; then it is a lame referral.
;;
;; Anything with non-zero response-code is clearly not a referral, so
;; that explains (1). If either of checks (2) and (3) fail then the
;; answer is a real, sensible answer to the question we posed. I'm not
;; 100% on why (4) is there; presumably it's to be conservative, and
;; not treat something possibly-valid as definitely-lame? Rules (5)
;; and (6) are the real heart of lameness, where a referral is given
;; to somewhere that can't be more authoritative than the responder
;; was supposed to be.
;;
;; We modify check (4) to ignore SOA records not in bailiwick, just
;; for consistency. It's correct to leave (5) and (6) alone because
;; it's incorrect for a server to refer us to anywhere at the same
;; level of the tree or further up the tree, but we do apply them to
;; every NS record rather than just the first, which is slightly
;; stricter than DJB's rule.
;;---------------------------------------------------------------------------
;; A NetworkQueryResult is a ListOf<Action>, some actions to take:
;; either involved in or resulting from completion of the network
;; query.
(define first-timeout 3) ;; seconds
;; A NetworkRequest is a (network-request UdpAddress Question
;; DomainName NEListOf<DomainName> UniqueID) representing the
;; parameters used to start and process a network query.
(struct network-request (client-socket
question
zone-origin
server-names
unique-id)
#:prefab)
;; A NetworkReply is a (network-reply UniqueID Maybe<CompleteAnswer>)
;; representing the final result of a network query.
(struct network-reply (unique-id answer) #:prefab)
;; A NetworkQueryState is a (network-query-state NetworkRequest
;; Integer Map<DomainName,ListOf<UdpAddress>> ListOf<UdpAddress>
;; Maybe<DomainName> ListOf<DomainName>), representing an in-progress
;; DNS network query.
(struct network-query-state (request
timeout
known-addresses
remaining-addresses
current-name
remaining-names)
#:prefab)
;; seconds -> Maybe<seconds>
(define (next-timeout timeout)
(case timeout
((3) 11)
((11) 45)
((45) #f)))
(define (make-dns-query-message q query-id)
(dns-message query-id
'request
'query
'non-authoritative
'not-truncated
'no-recursion-desired
'no-recursion-available
'no-error
(list q)
'()
'()
'()))
;; filter-dns-reply : Question DNSMessage DomainName
;; -> (or Maybe<CompleteAnswer> 'bad-answer 'lame-delegation)
;;
;; Filters RRs from the answer, authorities, and additional sections
;; of the passed-in `message`, returning the set of RRs surviving the
;; filter. RRs are only accepted if their `rr-name` falls in the
;; bailiwick of the given `zone-origin`. All of this only happens if
;; the passed-in message's `dns-message-response-code` is `'no-error`:
;; if it's `'name-error`, then `#f` is returned, and if it's any other
;; code, `'bad-answer` is returned.
;;
;; In cases where a CompleteAnswer would otherwise be returned, if the
;; answer is in fact a lame delegation (see notes above), then
;; 'lame-delegation is returned instead.
(define (filter-dns-reply q message zone-origin)
(case (dns-message-response-code message)
[(no-error)
(define (f l)
(list->set (filter (lambda (claim-rr) (in-bailiwick? (rr-name claim-rr) zone-origin)) l)))
;; Here's where we do the "lame referral" check. This code is
;; nice and simple (though wrong) without it. Ho hum.
(define answers (f (dns-message-answers message)))
(define unfiltered-authorities (dns-message-authorities message))
(define non-subzone-ns-rrs ;; all NS authorities not for a STRICT subzone of our zone-origin
(filter (lambda (rr) (and (eqv? (rr-type rr) 'ns)
(or (equal? (rr-name rr) zone-origin)
(not (in-bailiwick? (rr-name rr) zone-origin)))))
unfiltered-authorities))
(define authorities (f unfiltered-authorities))
(define answers-to-q ;; answers specifically to the question we asked
(set-filter (lambda (rr) (equal? (rr-name rr) (question-name q))) answers))
(define lame?
(and (set-empty? (filter-by-type answers-to-q 'cname))
(set-empty? (filter-rrs answers-to-q (question-type q) (question-class q)))
(set-empty? (filter-by-type authorities 'soa))
(not (null? non-subzone-ns-rrs))))
(if lame?
'lame-delegation
(complete-answer answers
authorities
(f (dns-message-additional message))))]
[(name-error) #f]
[else
(log-info (format "Abnormal response-code ~v in response to questions ~v"
(dns-message-response-code message)
(dns-message-questions message)))
'bad-answer]))
;; IPv4 -> String
(define (ip->host-name ip-address)
(match-define (vector a b c d) ip-address)
(format "~a.~a.~a.~a" a b c d))
(define (make-dns-address ip-address)
(udp-address (ip->host-name ip-address) 53))
;; network-query : UdpAddress Question DomainName NEListOf<DomainName> UniqueId -> BootK
(define (network-query s q zone-origin server-names unique-id)
(lambda (self-pid)
(try-next-server (network-query-state (network-request s q zone-origin server-names unique-id)
first-timeout
(hash)
'()
#f
server-names))))
(define (try-next-server w)
(match w
[(network-query-state _ #f _ _ _ _)
;; No more timeouts to try, so give up.
(on-answer w (empty-complete-answer) #f)]
[(network-query-state req timeout _ '() _ '())
;; No more addresses to try with this timeout. Refill the list
;; and bump the timeout and retry.
;; TODO: randomize ordering of servers in list. (Don't forget the initial fill.)
(try-next-server (struct-copy network-query-state w
[timeout (next-timeout timeout)]
[remaining-addresses '()]
[current-name #f]
[remaining-names (network-request-server-names req)]))]
[(network-query-state req _ known-addresses '() _ (cons current-name remaining-names))
(if (hash-has-key? known-addresses current-name)
(try-next-server (struct-copy network-query-state w
[remaining-addresses (hash-ref known-addresses
current-name)]
[current-name current-name]
[remaining-names remaining-names]))
(let ((subq (ns-question current-name (network-request-question req))))
(transition (struct-copy network-query-state w
[current-name current-name]
[remaining-names remaining-names])
(send-message subq)
(role/fresh subq-id (topic-subscriber (answered-question subq (wild)))
#:state w
[(answered-question (== subq) ans)
(define ips
(map make-dns-address (set->list (extract-addresses current-name ans))))
(sequence-actions
(try-next-server (struct-copy network-query-state w
[known-addresses (hash-set known-addresses
current-name
ips)]
[remaining-addresses ips]))
(delete-role subq-id))]))))]
[(network-query-state req timeout _ (cons current-ip remaining-ips) _ _)
(define rpc-id (gensym 'network-query/allocate-query-id))
(transition w
(send-message `(request ,rpc-id allocate-query-id))
(role (topic-subscriber `(reply ,rpc-id ,(wild)))
#:name rpc-id
#:state w
[`(reply ,(== rpc-id) ,id)
(sequence-actions (send-request (struct-copy network-query-state w
[remaining-addresses remaining-ips])
id
timeout
current-ip)
(delete-role rpc-id))]))]))
(define (on-answer w ans server-ip)
(match ans
['bad-answer ;; can come from filter-dns-reply
(try-next-server w)]
['lame-delegation ;; can come from filter-dns-reply
(match-define (network-query-state req _ known-addresses _ current-name _) w)
(match-define (network-request _ q zone-origin _ _) req)
(log-info (format "Lame delegation received from ~v (at ~v) in bailiwick ~v in response to ~v"
current-name
server-ip
zone-origin
q))
;; Actually remove the offending IP address so it's never tried again.
(try-next-server (struct-copy network-query-state w
[known-addresses (hash-update known-addresses
current-name
(lambda (addrs)
(remove server-ip addrs)))]))]
[else
(transition w
(send-message (network-reply (network-request-unique-id (network-query-state-request w))
ans)))]))
(define (send-request w query-id timeout server-ip)
(match-define (network-request s q zone-origin _ _) (network-query-state-request w))
(define query (make-dns-query-message q query-id))
(define subscription-id (list s query-id))
(define start-time (current-inexact-milliseconds))
(log-debug (format "Sending ~v ~v to ~v ~v with ~v seconds of timeout"
q query-id
zone-origin server-ip
timeout))
(transition w
(send-message (dns-request query s server-ip))
(send-message (set-timer subscription-id (* timeout 1000) 'relative))
(role (set (topic-subscriber (timer-expired subscription-id (wild)))
(topic-subscriber (dns-reply (wild) (wild) s)))
#:name subscription-id
#:state w
[(timer-expired (== subscription-id) _)
(log-debug (format "Timed out ~v ~v to ~v ~v after ~v seconds"
q query-id
zone-origin server-ip
timeout))
(sequence-actions (try-next-server w)
(delete-role subscription-id)
(send-message (list 'release-query-id query-id)))]
[(dns-reply reply-message source (== s))
;; TODO: maybe receive only specifically from the queried IP address?
(log-debug
(format
"Answer to ~v from ~v ~v in ~v ms~n-- Answers: ~v~n-- Authorities: ~v~n-- Additional: ~v"
q zone-origin server-ip
(inexact->exact (round (- (current-inexact-milliseconds) start-time)))
(dns-message-answers reply-message)
(dns-message-authorities reply-message)
(dns-message-additional reply-message)))
(if (not (= (dns-message-id reply-message) (dns-message-id query)))
(transition w)
(sequence-actions (on-answer w
(filter-dns-reply q reply-message zone-origin)
server-ip)
(delete-role subscription-id)
(send-message (list 'release-query-id query-id))))])))