scala> var textFile = sc.textFile("/usr/hadoop/inpatient.txt")
scala> textFile.first()
res1: String =
"第一行内容"
// Field 0 of the first line after splitting it on ",".
// The receiver and the argument list of split must stay on one line: a line break
// between `split` and `(` outside brackets ends the statement after `split`.
textFile.first().split(",")(0)
res2: String =
"0000718165"
// Field 5 of the first line after splitting it on ",".
// The receiver and the argument list of split must stay on one line: a line break
// between `split` and `(` outside brackets ends the statement after `split`.
textFile.first().split(",")(5)
res3: String =
"100.01"
scala> textFile.count()
res4: Long = 115411
// Number of lines that contain the substring "ICU".
textFile.filter(_.contains("ICU")).count()
res5: Long = 912
// Per-line character counts, then their sum over the whole data set.
var lineLengths = textFile.map(_.length)
var totalLenght = lineLengths.reduce(_ + _)
totalLenght: Int = 32859905
// Largest value (compared as Double) found in column 23.
// Lines that do not split into exactly 30 comma-separated fields contribute "0";
// double quotes are stripped from the field before it is compared.
textFile.map { line =>
  val cols = line.split(",")
  if (cols.size == 30) cols(23).replace("\"", "") else "0"
}.reduce { (best, next) =>
  if (best.toDouble > next.toDouble) best else next
}
res6: String = 300
/**
 * Serializable holder for five String fields taken from one input row.
 *
 * Every field is a mutable `var` and is stored exactly as passed to the
 * constructor; no parsing or unquoting happens here.
 */
@SerialVersionUID(100L)
class PATIENT(
    var PATIENT_NO: String,
    var NAME: String,
    var SEX_CODE: String,
    var BIRTHDATE: String,
    var BALANCE_COST: String
) extends Serializable
// Sample instance whose field values are simply the field names.
var p = new PATIENT("PATIENT_NO", "NAME", "SEX_CODE", "BIRTHDATE", "BALANCE_COST")
/**
 * Builds a PATIENT from one comma-separated input line.
 *
 * The line is split on "," and fields 0 through 4 are passed, in that order, as
 * PATIENT_NO, NAME, SEX_CODE, BIRTHDATE and BALANCE_COST. Values are stored as-is
 * (no trimming, no quote removal).
 *
 * @param line one raw text line
 * @return a PATIENT holding the first five fields of the line
 * @throws ArrayIndexOutOfBoundsException if the line has fewer than five fields
 */
def mapFunc(line: String): PATIENT = {
  val cols = line.split(",")
  // The last expression is the result. A `return` followed by a line break would be
  // parsed as a bare `return` (Unit) and would not type-check against PATIENT.
  new PATIENT(cols(0), cols(1), cols(2), cols(3), cols(4))
}
// BALANCE_COST of the record with the largest cost (compared as Double after
// stripping double quotes), over lines that split into exactly 30 fields.
textFile.filter(_.split(",").size == 30).map(mapFunc).reduce { (left, right) =>
  val leftCost = left.BALANCE_COST.replace("\"", "").toDouble
  val rightCost = right.BALANCE_COST.replace("\"", "").toDouble
  if (leftCost > rightCost) left else right
}.BALANCE_COST
// Same maximum as the unfiltered query, restricted to records whose SEX_CODE is the
// quoted literal "M" (the double quotes are part of the compared value).
textFile.filter(_.split(",").size == 30).map(mapFunc).filter(_.SEX_CODE == "\"M\"").reduce { (left, right) =>
  val leftCost = left.BALANCE_COST.replace("\"", "").toDouble
  val rightCost = right.BALANCE_COST.replace("\"", "").toDouble
  if (leftCost > rightCost) left else right
}.BALANCE_COST
// Same maximum as the unfiltered query, restricted to records whose SEX_CODE is the
// quoted literal "F" (the double quotes are part of the compared value).
textFile.filter(_.split(",").size == 30).map(mapFunc).filter(_.SEX_CODE == "\"F\"").reduce { (left, right) =>
  val leftCost = left.BALANCE_COST.replace("\"", "").toDouble
  val rightCost = right.BALANCE_COST.replace("\"", "").toDouble
  if (leftCost > rightCost) left else right
}.BALANCE_COST
scala>
exit